[PATCH v4] kvm tools: Support multiple net devices
This patch adds support for multiple network devices. The command line syntax changes to the following: --network/-n [mode=tap/user/none][,guest_ip=ip][,host_ip= ip][,guest_mac=mac][,script=file] Each of the parameters is optional, and the config defaults to a TAP based networking with a sequential MAC. Signed-off-by: Sasha Levin --- tools/kvm/builtin-run.c| 147 - tools/kvm/include/kvm/virtio-net.h |4 +- tools/kvm/virtio/net.c | 159 3 files changed, 197 insertions(+), 113 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 28dc95a..a88503f 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -65,6 +65,7 @@ __thread struct kvm_cpu *current_kvm_cpu; static u64 ram_size; static u8 image_count; +static u8 num_net_devices; static bool virtio_rng; static const char *kernel_cmdline; static const char *kernel_filename; @@ -80,6 +81,7 @@ static const char *guest_mac; static const char *host_mac; static const char *script; static const char *guest_name; +static struct virtio_net_params *net_params; static bool single_step; static bool readonly_image[MAX_DISK_IMAGES]; static bool vnc; @@ -87,6 +89,7 @@ static bool sdl; static bool balloon; static bool using_rootfs; static bool custom_rootfs; +static bool no_net; extern bool ioport_debug; extern int active_console; extern int debug_iodelay; @@ -182,6 +185,90 @@ static int tty_parser(const struct option *opt, const char *arg, int unset) return 0; } +static inline void str_to_mac(const char *str, char *mac) +{ + sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + mac, mac+1, mac+2, mac+3, mac+4, mac+5); +} +static int set_net_param(struct virtio_net_params *p, const char *param, + const char *val) +{ + if (strcmp(param, "guest_mac") == 0) { + str_to_mac(val, p->guest_mac); + } else if (strcmp(param, "mode") == 0) { + if (!strncmp(val, "user", 4)) { + int i; + + for (i = 0; i < num_net_devices; i++) + if (net_params[i].mode == NET_MODE_USER) + die("Only one usermode network device allowed at a time"); + p->mode = NET_MODE_USER; + } else if (!strncmp(val, "tap", 3)) { + p->mode = NET_MODE_TAP; + } else if (!strncmp(val, "none", 4)) { + no_net = 1; + return -1; + } else + die("Unkown network mode %s, please use user, tap or none", network); + } else if (strcmp(param, "script") == 0) { + p->script = strdup(val); + } else if (strcmp(param, "guest_ip") == 0) { + p->guest_ip = strdup(val); + } else if (strcmp(param, "host_ip") == 0) { + p->host_ip = strdup(val); + } + + return 0; +} + +static int netdev_parser(const struct option *opt, const char *arg, int unset) +{ + struct virtio_net_params p; + char *buf = NULL, *cmd = NULL, *cur = NULL; + bool on_cmd = true; + + if (arg) { + buf = strdup(arg); + if (buf == NULL) + die("Failed allocating new net buffer"); + cur = strtok(buf, ",="); + } + + p = (struct virtio_net_params) { + .guest_ip = DEFAULT_GUEST_ADDR, + .host_ip= DEFAULT_HOST_ADDR, + .script = DEFAULT_SCRIPT, + .mode = NET_MODE_TAP, + }; + + str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac); + p.guest_mac[5] += num_net_devices; + + while (cur) { + if (on_cmd) { + cmd = cur; + } else { + if (set_net_param(&p, cmd, cur) < 0) + goto done; + } + on_cmd = !on_cmd; + + cur = strtok(NULL, ",="); + }; + + num_net_devices++; + + net_params = realloc(net_params, num_net_devices * sizeof(*net_params)); + if (net_params == NULL) + die("Failed adding new network device"); + + net_params[num_net_devices - 1] = p; + +done: + free(buf); + return 0; +} + static int shmem_parser(const struct option *opt, const char *arg, int unset) { 
const u64 default_size = SHMEM_DEFAULT_SIZE; @@ -339,18 +426,9 @@ static const struct option options[] = { "Kernel command line arguments"), OPT_GROUP("Networking options:"), - OPT_STRING('n', "network", &network, "user, tap, none", - "Network to use"), - OPT_STRING('\0', "host-ip", &host_ip, "a.b.c.d", - "Assign this address to the host side networking")
Re: [Qemu-devel] RFC [v2]: vfio / device assignment -- layout of device fd files
On Tue, 2011-09-27 at 16:28 -0500, Scott Wood wrote: > On 09/26/2011 07:45 PM, Alex Williamson wrote: > > On Mon, 2011-09-26 at 18:59 -0500, Scott Wood wrote: > >> On 09/26/2011 01:34 PM, Alex Williamson wrote: > >>> /* Reset the device */ > >>> #define VFIO_DEVICE_RESET _IO(, ,) > >> > >> What generic way do we have to do this? We should probably have a way > >> to determine whether it's possible, without actually asking to do it. > > > > It's not generic, it could be a VFIO_DEVICE_PCI_RESET or we could add a > > bit to the device flags to indicate if it's available or we could add a > > "probe" arg to the ioctl to either check for existence or do it. > > Even with PCI, isn't this only possible if function-level reset is > supported? There are a couple other things we can do if FLR isn't present (D3hot transition, secondary bus reset, device specific resets are possible). > I think we need a flag. Ok, PCI has a pci_probe_reset_function() and pci_reset_function(). I'd probably mimic those in the vfio device ops. Common vfio code can probe the reset and set the flag appropriately and we can have a common VFIO_DEVICE_RESET ioctl that calls into the device ops reset function. > For devices that can't be reset by the kernel, we'll want the ability to > stop/start DMA acccess through the IOMMU (or other bus-specific means), > separate from whether the fd is open. If a device is assigned to a > partition and that partition gets reset, we'll want to disable DMA > before we re-use the memory, and enable it after the partition has reset > or quiesced the device (which requires the fd to be open). Maybe this can be accomplished via an iommu_detach_device() to temporarily disassociate it from the domain. We could also unmap all the DMA. Anyway, a few possibilities. > >>> /* PCI MSI setup, arg[0] = #, arg[1-n] = eventfds */ > >>> #define VFIO_DEVICE_PCI_SET_MSI_EVENTFDS _IOW(, , int) > >>> #define VFIO_DEVICE_PCI_SET_MSIX_EVENTFDS _IOW(, , int) > >>> > >>> Hope that covers it. > >> > >> It could be done this way, but I predict that the code (both kernel and > >> user side) will be larger. Maybe not much more complex, but more > >> boilerplate. > >> > >> How will you manage extensions to the interface? > > > > I would assume we'd do something similar to the kvm capabilities checks. > > This information is already built into the data-structure approach. If we define it to be part of the flags, then it's built-in to the ioctl approach too... > >> The table should not be particularly large, and you'll need to keep the > >> information around in some form regardless. Maybe in the PCI case you > >> could produce it dynamically (though I probably wouldn't), but it really > >> wouldn't make sense in the device tree case. > > > > It would be entirely dynamic for PCI, there's no advantage to caching > > it. Even for device tree, if you can't fetch it dynamically, you'd have > > to duplicate it between an internal data structure and a buffer reading > > the table. > > I don't think we'd need to keep the device tree path/index info around > for anything but the table -- but really, this is a minor consideration. > > >> You also lose the ability to easily have a human look at the hexdump for > >> debugging; you'll need a special "lsvfio" tool. You might want one > >> anyway to pretty-print the info, but with ioctls it's mandatory. > > > > I don't think this alone justifies duplicating the data and making it > > difficult to parse on both ends. 
Chances are we won't need such a tool > > for the ioctl interface because it's easier to get it right the first > > time ;) > > It's not just useful for getting the code right, but for e.g. sanity > checking that the devices were bound properly. I think such a tool > would be generally useful, no matter what the kernel interface ends up > being. I don't just use lspci to debug the PCI subsystem. :-) This is also a minor consideration. Looking at hexdumps isn't much to rely on for debugging and if we take the step of writing a tool, it's not much harder to write for either interface. The table is more akin to dumping the data, but I feel the ioctl is easier for how a driver would probably make use of the data (linear vs random access). > > Note that I'm not stuck on this interface, I was just thinking about how > > to generate the table last week, it seemed like a pain so I thought I'd > > spend a few minutes outlining an ioctl interface... turns out it's not > > so bad. Thanks, > > Yeah, it can work either way, as long as the information's there and > there's a way to add new bits of information, or new bus types, down the > road. Mainly a matter of aesthetics between the two. It'd be nice if David would chime back in since he objected to the table. Does an ioctl interface look better? Alex Graf, any opinions? Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.ke
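To make the "capability flag plus a common reset ioctl" idea discussed above concrete, here is a hypothetical user-space sketch. Every constant, struct and ioctl number below is invented purely for illustration (the placeholders quoted in the thread are not yet a real ABI), so this is one possible reading of the proposal, not the eventual VFIO interface.

#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical names only; not the merged VFIO ABI */
struct hypothetical_device_info {
	uint32_t argsz;
	uint32_t flags;
#define HYPOTHETICAL_DEVICE_FLAG_RESET	(1 << 0)	/* kernel can reset this device */
	uint32_t num_regions;
	uint32_t num_irqs;
};

#define HYPOTHETICAL_GET_DEVICE_INFO	_IOR(';', 100, struct hypothetical_device_info)
#define HYPOTHETICAL_DEVICE_RESET	_IO(';', 101)

static int reset_if_possible(int device_fd)
{
	struct hypothetical_device_info info = { .argsz = sizeof(info) };

	if (ioctl(device_fd, HYPOTHETICAL_GET_DEVICE_INFO, &info) < 0)
		return -1;

	/*
	 * Common vfio code would probe the bus-specific reset (e.g. via
	 * pci_probe_reset_function()) and set this flag accordingly, so
	 * userspace can check for support without actually resetting.
	 */
	if (!(info.flags & HYPOTHETICAL_DEVICE_FLAG_RESET)) {
		fprintf(stderr, "device cannot be reset by the kernel\n");
		return -1;
	}

	/* common ioctl that dispatches to the bus-specific device op */
	return ioctl(device_fd, HYPOTHETICAL_DEVICE_RESET);
}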
tcpdump locks up kvm host for a while.
I can reproduce this at will, so feel free to ask for debugging steps. My host and VMs are all Fedora 15. Every time I run tcpdump on a VM, that VM hangs for a while: it uses all available CPU, "virsh list" hangs trying to show it, and I can't shut it down except by killing the qemu-kvm process. After a few minutes (the delay seems proportional to how long it's been since I last ran tcpdump: the longer it's been, the longer I have to wait), everything goes back to normal if I don't kill the VM. Any idea what's going on there? -Robin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvm] Weird console issue.
On Mon, Sep 26, 2011 at 01:45:07AM -0700, Robin Lee Powell wrote: > Everything seems to work until it asks for a password. As soon as > that happens, any character I type causes it to respond as though > I'd hit "enter". This continues until /bin/login completes and a > new getty starts, at which point it prints the right headers and I > can enter my userid, and then it asks for a password and it breaks > again in the same way. Bizarrely, the problem turned out to be in my kernel command line (the grub "kernel" entry): kernel /vmlinuz-2.6.38.8-32.fc15.x86_64 ro root=/dev/vda2 rd_NO_LUKS rd_NO_LVM rd_NO_MD rd_NO_DM LANG=en_US.UTF-8 SYSFONT=latarcyrheb-sun16 KEYTABLE=us console=ttyS0,115200 console=tty0 rhgb Apparently, having the two console= entries screwed things up, which is kind of lame. But hey, problem solved! :D -Robin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
order 1 page allocation failures
Hi, I've been having some issues with KVM recently where one or more vms will cause page allocation failure messages, usually with the backtrace including networking functions, example follows: [362409.429944] kvm: page allocation failure: order:1, mode:0x20 [362409.429957] Pid: 3453, comm: kvm Not tainted 3.0.0-1-amd64 #1 [362409.429965] Call Trace: [362409.429970][] ? warn_alloc_failed+0x108/0x11b [362409.429998] [] ? __alloc_pages_nodemask+0x6e6/0x75c [362409.430012] [] ? kmem_getpages+0x55/0xf0 [362409.430022] [] ? fallback_alloc+0x129/0x1c1 [362409.430035] [] ? paravirt_read_tsc+0x5/0x8 [362409.430045] [] ? kmem_cache_alloc+0x73/0xf0 [362409.430057] [] ? sk_prot_alloc+0x2b/0x128 [362409.430067] [] ? sk_clone+0x14/0x2bd [362409.430077] [] ? inet_csk_clone+0x10/0x91 [362409.430088] [] ? tcp_create_openreq_child+0x21/0x41a [362409.430099] [] ? tcp_v4_syn_recv_sock+0x33/0x208 [362409.430110] [] ? tcp_check_req+0x1ff/0x2dd [362409.430122] [] ? inet_csk_search_req+0x35/0xa7 [362409.430132] [] ? tcp_v4_do_rcv+0x206/0x32c [362409.430144] [] ? tcp_v4_rcv+0x419/0x66c [362409.430154] [] ? native_sched_clock+0x28/0x30 [362409.430173] [] ? ip_local_deliver_finish+0x14b/0x1bb [362409.430186] [] ? __netif_receive_skb+0x3d7/0x40b [362409.430197] [] ? netif_receive_skb+0x52/0x58 [362409.430220] [] ? br_nf_pre_routing_finish+0x1d4/0x1e1 [bridge] [362409.430241] [] ? NF_HOOK_THRESH+0x3b/0x55 [bridge] [362409.430260] [] ? br_nf_pre_routing+0x3be/0x3cb [bridge] [362409.430272] [] ? nf_iterate+0x41/0x77 [362409.430288] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430305] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430315] [] ? nf_hook_slow+0x73/0x111 [362409.430330] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430342] [] ? try_to_wake_up+0x199/0x199 [362409.430358] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430375] [] ? NF_HOOK.clone.4+0x3c/0x56 [bridge] [362409.430392] [] ? br_handle_frame+0x1af/0x1c6 [bridge] [362409.430408] [] ? br_handle_frame_finish+0x1f3/0x1f3 [bridge] [362409.430420] [] ? __netif_receive_skb+0x2c4/0x40b [362409.430432] [] ? process_backlog+0x78/0x157 [362409.430443] [] ? net_rx_action+0xa4/0x1b2 [362409.430454] [] ? test_tsk_need_resched+0xe/0x17 [362409.430465] [] ? __do_softirq+0xb9/0x178 [362409.430476] [] ? call_softirq+0x1c/0x30 [362409.430481][] ? do_softirq+0x3f/0x84 [362409.430498] [] ? netif_rx_ni+0x1e/0x27 [362409.430509] [] ? tun_get_user+0x390/0x3b8 [tun] [362409.430520] [] ? bit_waitqueue+0x71/0xa4 [362409.430529] [] ? _flat_send_IPI_mask+0x6a/0x7c [362409.430541] [] ? tun_get_socket+0x3b/0x3b [tun] [362409.430552] [] ? tun_chr_aio_write+0x5e/0x79 [tun] [362409.430563] [] ? do_sync_readv_writev+0x9a/0xd5 [362409.430574] [] ? need_resched+0x1a/0x23 [362409.430585] [] ? _cond_resched+0x9/0x20 [362409.430596] [] ? copy_from_user+0x18/0x30 [362409.430608] [] ? security_file_permission+0x18/0x33 [362409.430618] [] ? do_readv_writev+0xa4/0x11a [362409.430627] [] ? fput+0x1a/0x1a2 [362409.430636] [] ? sys_writev+0x45/0x90 [362409.430647] [] ? 
system_call_fastpath+0x16/0x1b [362409.430654] Mem-Info: [362409.430659] Node 0 DMA per-cpu: [362409.430667] CPU0: hi:0, btch: 1 usd: 0 [362409.430673] CPU1: hi:0, btch: 1 usd: 0 [362409.430679] CPU2: hi:0, btch: 1 usd: 0 [362409.430686] CPU3: hi:0, btch: 1 usd: 0 [362409.430691] Node 0 DMA32 per-cpu: [362409.430699] CPU0: hi: 186, btch: 31 usd: 147 [362409.430705] CPU1: hi: 186, btch: 31 usd: 82 [362409.430712] CPU2: hi: 186, btch: 31 usd: 171 [362409.430718] CPU3: hi: 186, btch: 31 usd: 86 [362409.430724] Node 0 Normal per-cpu: [362409.430730] CPU0: hi: 186, btch: 31 usd: 157 [362409.430737] CPU1: hi: 186, btch: 31 usd: 96 [362409.430743] CPU2: hi: 186, btch: 31 usd: 163 [362409.430749] CPU3: hi: 186, btch: 31 usd: 173 [362409.430764] active_anon:576670 inactive_anon:110462 isolated_anon:0 [362409.430769] active_file:615766 inactive_file:638287 isolated_file:0 [362409.430774] unevictable:0 dirty:3203 writeback:0 unstable:0 [362409.430778] free:31113 slab_reclaimable:36977 slab_unreclaimable:11009 [362409.430783] mapped:11738 shmem:226 pagetables:9104 bounce:0 [362409.430791] Node 0 DMA free:15912kB min:128kB low:160kB high:192kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15688kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes [362409.430827] lowmem_reserve[]: 0 3254 8051 8051 [362409.430837] Node 0 DMA32 free:57588kB min:27260kB low:34072kB high:40888kB active_anon:454620kB inactive_anon:92920kB active_file:1279680kB inactive_file:1348708kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3332192kB mlock
Re: How many threads should a kvm vm be starting?
On September 27, 2011, Avi Kivity wrote: > On 09/27/2011 03:29 AM, Thomas Fjellstrom wrote: > > I just noticed something interesting, a virtual machine on one of my > > servers seems to have 69 threads (including the main thread). Other > > guests on the machine only have a couple threads. > > > > Is this normal? or has something gone horribly wrong? > > It's normal if the guest does a lot of I/O. The thread count should go > down when the guest idles. Ah, that would make sense. Though it kind of defeats the purpose of assigning a VM a single cpu/core: a single VM can now DoS an entire multi-core server. It pretty much pegged my dual-core (with HT) server for a couple of hours. -- Thomas Fjellstrom tho...@fjellstrom.ca -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] RFC [v2]: vfio / device assignment -- layout of device fd files
On 09/26/2011 07:45 PM, Alex Williamson wrote: > On Mon, 2011-09-26 at 18:59 -0500, Scott Wood wrote: >> On 09/26/2011 01:34 PM, Alex Williamson wrote: >>> /* Reset the device */ >>> #define VFIO_DEVICE_RESET _IO(, ,) >> >> What generic way do we have to do this? We should probably have a way >> to determine whether it's possible, without actually asking to do it. > > It's not generic, it could be a VFIO_DEVICE_PCI_RESET or we could add a > bit to the device flags to indicate if it's available or we could add a > "probe" arg to the ioctl to either check for existence or do it. Even with PCI, isn't this only possible if function-level reset is supported? I think we need a flag. For devices that can't be reset by the kernel, we'll want the ability to stop/start DMA acccess through the IOMMU (or other bus-specific means), separate from whether the fd is open. If a device is assigned to a partition and that partition gets reset, we'll want to disable DMA before we re-use the memory, and enable it after the partition has reset or quiesced the device (which requires the fd to be open). >>> /* PCI MSI setup, arg[0] = #, arg[1-n] = eventfds */ >>> #define VFIO_DEVICE_PCI_SET_MSI_EVENTFDS_IOW(, , int) >>> #define VFIO_DEVICE_PCI_SET_MSIX_EVENTFDS _IOW(, , int) >>> >>> Hope that covers it. >> >> It could be done this way, but I predict that the code (both kernel and >> user side) will be larger. Maybe not much more complex, but more >> boilerplate. >> >> How will you manage extensions to the interface? > > I would assume we'd do something similar to the kvm capabilities checks. This information is already built into the data-structure approach. >> The table should not be particularly large, and you'll need to keep the >> information around in some form regardless. Maybe in the PCI case you >> could produce it dynamically (though I probably wouldn't), but it really >> wouldn't make sense in the device tree case. > > It would be entirely dynamic for PCI, there's no advantage to caching > it. Even for device tree, if you can't fetch it dynamically, you'd have > to duplicate it between an internal data structure and a buffer reading > the table. I don't think we'd need to keep the device tree path/index info around for anything but the table -- but really, this is a minor consideration. >> You also lose the ability to easily have a human look at the hexdump for >> debugging; you'll need a special "lsvfio" tool. You might want one >> anyway to pretty-print the info, but with ioctls it's mandatory. > > I don't think this alone justifies duplicating the data and making it > difficult to parse on both ends. Chances are we won't need such a tool > for the ioctl interface because it's easier to get it right the first > time ;) It's not just useful for getting the code right, but for e.g. sanity checking that the devices were bound properly. I think such a tool would be generally useful, no matter what the kernel interface ends up being. I don't just use lspci to debug the PCI subsystem. :-) > Note that I'm not stuck on this interface, I was just thinking about how > to generate the table last week, it seemed like a pain so I thought I'd > spend a few minutes outlining an ioctl interface... turns out it's not > so bad. Thanks, Yeah, it can work either way, as long as the information's there and there's a way to add new bits of information, or new bus types, down the road. Mainly a matter of aesthetics between the two. 
-Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [KVM-autotest][PATCH] cgroup test with KVM guest +first subtests
On 09/22/2011 01:29 PM, Lukas Doktor wrote: Hi guys, Do you remember the discussion about cgroup testing in autotest vs. LTP? I hope there won't be any doubts about this one as ground_test (+ first 2 subtests) are strictly focused on cgroups features enforced on KVM guest systems. Also more subtests will follow if you approve the test structure (blkio_throttle, memory, cpus...). No matter whether we drop or keep the general 'cgroup' test. The 'cgroup_common.py' library can be imported either from 'client/tests/cgroup/' directory or directly from 'client/tests/kvm/tests/' directory. The modifications of 'cgroup_common.py' library is backward compatible with general cgroup test. See the commits for details. Ok, after quite a bit of back and forth, pull request merged, thanks Lukas! Regards, Lukáš Doktor -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 09:28:37AM -0400, Ohad Ben-Cohen wrote: > So you're suggesting to re-implement find_next_bit() using ffs()/fls() > and shifting ? No. I suggest a simpler and shorter algorithm using the bit helpers. Something like that: min_idx = __ffs(iommu_page_sizes); while (size) { /* Max alignment allowed by current physical address */ phys_idx = __ffs(phys); /* Max alignment allowed by current size */ size_idx = __fls(size); /* special case: iova == 0 */ if (likely(phys)) idx = min(phys_idx, size_idx); else idx = size_idx; BUG_ON(idx < min_idx); psize = 1UL << idx; /* search next smaller page-size supported */ while (psize && !(iommu_page_sizes & psize)) psize >>= 1; BUG_ON(psize == 0); iommu_ops->map(domain, iova, phys, get_order(psize), prot); iova += psize; phys += psize; size -= psize; } It is only C-style pseudo-code, of course. These __ffs and __fls lines all translate to a single instruction later. The find_next_bit() function has a lot more overhead because it needs to take account of real bitmaps (arrays of ulong). But this complexity is not required here. And yes, overhead is important when we implement the generic dma-ops on-top of the iommu-api because this will make the iommu_map function a fast-path. So we really care about overhead here. Joerg -- AMD Operating System Research Center Advanced Micro Devices GmbH Einsteinring 24 85609 Dornach General Managers: Alberto Bozzo, Andrew Bowd Registration: Dornach, Landkr. Muenchen; Registerger. Muenchen, HRB Nr. 43632 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
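For readers following the thread, here is a self-contained user-space rendering of the loop sketched above, so it can be compiled and traced. It is a sketch under stated assumptions: the kernel's __ffs()/__fls() helpers are replaced by compiler builtins, iommu_ops->map() is replaced by a printf(), and the supported-page-size bitmap is an example value, not anything read from hardware.

#include <stdio.h>
#include <stddef.h>

/* bit i set => a page size of (1UL << i) bytes is supported: 4K, 2M, 1G */
static const unsigned long iommu_page_sizes =
		(1UL << 12) | (1UL << 21) | (1UL << 30);

static unsigned long bit_ffs(unsigned long x)
{
	return __builtin_ctzl(x);			/* stands in for __ffs() */
}

static unsigned long bit_fls(unsigned long x)
{
	return 8 * sizeof(long) - 1 - __builtin_clzl(x);	/* stands in for __fls() */
}

static int map_range(unsigned long iova, unsigned long phys, size_t size)
{
	const unsigned long min_idx = bit_ffs(iommu_page_sizes);

	while (size) {
		unsigned long addr_merge = iova | phys;
		/* max alignment allowed by the current addresses and by the size */
		unsigned long idx = addr_merge ? bit_ffs(addr_merge) : bit_fls(size);
		unsigned long psize;

		if (idx > bit_fls(size))
			idx = bit_fls(size);

		/* search the next smaller page size supported by the hardware */
		psize = 1UL << idx;
		while (psize && !(iommu_page_sizes & psize))
			psize >>= 1;

		if (psize < (1UL << min_idx))
			return -1;	/* unaligned request; kernel code would BUG_ON() */

		/* iommu_ops->map(domain, iova, phys, get_order(psize), prot) goes here */
		printf("map iova 0x%lx -> pa 0x%lx, 0x%lx bytes\n", iova, phys, psize);

		iova += psize;
		phys += psize;
		size -= psize;
	}

	return 0;
}

int main(void)
{
	/* 2 MB + 4 KB at 2 MB-aligned addresses: one 2M mapping, then one 4K mapping */
	return map_range(0x40200000UL, 0x80000000UL, (2UL << 20) + 4096);
}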
Qemu/KVM is 3x slower under libvirt
I repost this, this time by also including the libvirt mailing list. Info on my libvirt: it's the version in Ubuntu 11.04 Natty which is 0.8.8-1ubuntu6.5 . I didn't recompile this one, while Kernel and qemu-kvm are vanilla and compiled by hand as described below. My original message follows: This is really strange. I just installed a new host with kernel 3.0.3 and Qemu-KVM 0.14.1 compiled by me. I have created the first VM. This is on LVM, virtio etc... if I run it directly from bash console, it boots in 8 seconds (it's a bare ubuntu with no graphics), while if I boot it under virsh (libvirt) it boots in 20-22 seconds. This is the time from after Grub to the login prompt, or from after Grub to the ssh-server up. I was almost able to replicate the whole libvirt command line on the bash console, and it still goes almost 3x faster when launched from bash than with virsh start vmname. The part I wasn't able to replicate is the -netdev part because I still haven't understood the semantics of it. This is my bash commandline: /opt/qemu-kvm-0.14.1/bin/qemu-system-x86_64 -M pc-0.14 -enable-kvm -m 2002 -smp 2,sockets=2,cores=1,threads=1 -name vmname1-1 -uuid ee75e28a-3bf3-78d9-3cba-65aa63973380 -nodefconfig -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/vmname1-1.monitor,server,nowait -mon chardev=charmonitor,id=monitor,mode=readline -rtc base=utc -boot order=dc,menu=on -drive file=/dev/mapper/vgPtpVM-lvVM_Vmname1_d1,if=none,id=drive-virtio-disk0,boot=on,format=raw,cache=none,aio=native -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -drive if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw,cache=none,aio=native -device ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -net nic,model=virtio -net tap,ifname=tap0,script=no,downscript=no -usb -vnc 127.0.0.1:0 -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 Which was taken from libvirt's command line. The only modifications I did to the original libvirt commandline (seen with ps aux) were: - Removed -S - Network was: -netdev tap,fd=17,id=hostnet0,vhost=on,vhostfd=18 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:05:36:60,bus=pci.0,addr=0x3 Has been simplified to: -net nic,model=virtio -net tap,ifname=tap0,script=no,downscript=no and manual bridging of the tap0 interface. Firstly I had thought that this could be fault of the VNC: I have compiled qemu-kvm with no separate vnc thread. I thought that libvirt might have connected to the vnc server at all times and this could have slowed down the whole VM. But then I also tried connecting vith vncviewer to the KVM machine launched directly from bash, and the speed of it didn't change. So no, it doesn't seem to be that. BTW: is the slowdown of the VM on "no separate vnc thread" only in effect when somebody is actually connected to VNC, or always? Also, note that the time difference is not visible in dmesg once the machine has booted. So it's not a slowdown in detecting devices. Devices are always detected within the first 3 seconds, according to dmesg, at 3.6 seconds the first ext4 mount begins. It seems to be really the OS boot that is slow... it seems an hard disk performance problem. Thank you R. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] KVM test: unattended_install: Remove unused variable images_dir
Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 7d90a91..7655bc1 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -211,7 +211,6 @@ class UnattendedInstallConfig(object): @param params: Dictionary with test parameters. """ root_dir = test.bindir -images_dir = os.path.join(root_dir, 'images') self.deps_dir = os.path.join(root_dir, 'deps') self.unattended_dir = os.path.join(root_dir, 'unattended') -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] KVM test: unattended_install: Fix the background migration code
Currently we use initrd and kernel params to perform unattended install in all supported linux guests. Before migrating, we used to remove those params from the qemu params because they allegedly 'broke' the migration. Well, it turns out this code renders our migrate-in-background code path useless, and it does not appear to break migration in the tests I've run. So, let's just get rid of it and be happy. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py | 10 -- 1 files changed, 0 insertions(+), 10 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 023d00f..b1d23f6 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -611,16 +611,6 @@ def run_unattended_install(test, params, env): pass if migrate_background: -# Drop the params which may break the migration -# Better method is to use dnsmasq to do the -# unattended installation -if vm.params.get("initrd"): -vm.params["initrd"] = None -if vm.params.get("kernel"): -vm.params["kernel"] = None -if vm.params.get("extra_params"): -vm.params["extra_params"] = re.sub("--append '.*'", "", - vm.params["extra_params"]) vm.migrate(timeout=mig_timeout, protocol=mig_protocol) else: time.sleep(1) -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4] KVM test: unattended_install: Remove unused variable post_install_delay
Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 7655bc1..023d00f 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -580,7 +580,6 @@ def run_unattended_install(test, params, env): vm.create() install_timeout = int(params.get("timeout", 3000)) -post_install_delay = int(params.get("post_install_delay", 0)) port = vm.get_port(int(params.get("guest_port_unattended_install"))) migrate_background = params.get("migrate_background") == "yes" -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] KVM test: unattended_install: Fix unused variable warning
Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 1ff73af..7d90a91 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -91,6 +91,7 @@ class FloppyDisk(Disk): m_cmd = 'mount -o loop,rw %s %s' % (path, self.mount) utils.run(m_cmd) except error.CmdError, e: +logging.error("Error during floppy initialization: %s" % e) cleanup(self.mount) raise -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4] KVM test: Unattended install fixes
Here we have a handful of fixes for the unattended install code. In special, with the last patch we recover the ability to run ping pong migration in the background during the unattended install for linux guests. This patchset was also sent as a pull request on gitub. Feel free to make comments there: https://github.com/autotest/autotest/pull/24 Lucas Meneghel Rodrigues (4): KVM test: unattended_install: Fix unused variable warning KVM test: unattended_install: Remove unused variable images_dir KVM test: unattended_install: Remove unused variable post_install_delay KVM test: unattended_install: Fix the background migration code client/tests/kvm/tests/unattended_install.py | 13 + 1 files changed, 1 insertions(+), 12 deletions(-) -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [PATCH 00/10] [PATCH RFC V2] Paravirtualized ticketlocks
On 09/27/2011 02:34 AM, Stephan Diestelhorst wrote: > On Wednesday 14 September 2011, 17:31:32 Jeremy Fitzhardinge wrote: >> This series replaces the existing paravirtualized spinlock mechanism >> with a paravirtualized ticketlock mechanism. > [...] >> The unlock code is very straightforward: >> prev = *lock; >> __ticket_unlock_release(lock); >> if (unlikely(__ticket_in_slowpath(lock))) >> __ticket_unlock_slowpath(lock, prev); >> >> which generates: >> push %rbp >> mov%rsp,%rbp >> >> movzwl (%rdi),%esi >> addb $0x2,(%rdi) >> movzwl (%rdi),%eax >> testb $0x1,%ah >> jne1f >> >> pop%rbp >> retq >> >> ### SLOWPATH START >> 1: movzwl (%rdi),%edx >> movzbl %dh,%ecx >> mov%edx,%eax >> and$-2,%ecx # clear TICKET_SLOWPATH_FLAG >> mov%cl,%dh >> cmp%dl,%cl # test to see if lock is uncontended >> je 3f >> >> 2: movzbl %dl,%esi >> callq *__ticket_unlock_kick# kick anyone waiting >> pop%rbp >> retq >> >> 3: lock cmpxchg %dx,(%rdi) # use cmpxchg to safely write back flag >> jmp2b >> ### SLOWPATH END > [...] >> Thoughts? Comments? Suggestions? > You have a nasty data race in your code that can cause a losing > acquirer to sleep forever, because its setting the TICKET_SLOWPATH flag > can race with the lock holder releasing the lock. > > I used the code for the slow path from the GIT repo. > > Let me try to point out an interleaving: > > Lock is held by one thread, contains 0x0200. > > _Lock holder_ _Acquirer_ > mov$0x200,%eax > lock xadd %ax,(%rdi) > // ax:= 0x0200, lock:= 0x0400 > ... > // this guy spins for a while, reading > // the lock > ... > //trying to free the lock > movzwl (%rdi),%esi (esi:=0x0400) > addb $0x2,(%rdi) (LOCAL copy of lock is now: 0x0402) > movzwl (%rdi),%eax (local forwarding from previous store: eax := 0x0402) > testb $0x1,%ah(no wakeup of anybody) > jne1f > > callq *__ticket_lock_spinning > ... > // __ticket_enter_slowpath(lock) > lock or (%rdi), $0x100 > // (global view of lock := 0x0500) > ... > ACCESS_ONCE(lock->tickets.head) == want > // (reads 0x00) > ... > xen_poll_irq(irq); // goes to sleep > ... > [addb $0x2,(%rdi)] > // (becomes globally visible only now! global view of lock := 0x0502) > ... > > Your code is reusing the (just about) safe version of unlocking a > spinlock without understanding the effect that close has on later > memory ordering. It may work on CPUs that cannot do narrow -> wide > store to load forwarding and have to make the addb store visible > globally. This is an implementation artifact of specific uarches, and > you mustn't rely on it, since our specified memory model allows looser > behaviour. Ah, thanks for this observation. I've seen this bug before when I didn't pay attention to the unlock W vs flag R ordering at all, and I was hoping the aliasing would be sufficient - and certainly this seems to have been OK on my Intel systems. But you're saying that it will fail on current AMD systems? Have you tested this, or is this just from code analysis (which I agree with after reviewing the ordering rules in the Intel manual). > Since you want to get that addb out to global memory before the second > read, either use a LOCK prefix for it, add an MFENCE between addb and > movzwl, or use a LOCKed instruction that will have a fencing effect > (e.g., to top-of-stack)between addb and movzwl. Hm. I don't really want to do any of those because it will probably have a significant effect on the unlock performance; I was really trying to avoid adding any more locked instructions. 
A previous version of the code had an mfence in here, but I hit on the idea of using aliasing to get the ordering I want - but overlooked the possible effect of store forwarding. I guess it comes down to throwing myself on the efficiency of some kind of fence instruction. I guess an lfence would be sufficient; is that any more efficient than a full mfence? At least I can make it so that its only present when pv ticket locks are actually in use, so it won't affect the native case. Could you give me a pointer to AMD's description of the ordering rules? Thanks, J -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majo
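To pin down what "add an MFENCE between addb and movzwl" would look like, below is a minimal C sketch of the unlock path with an explicit full fence between the head update and the slowpath-flag read. The struct layout, helper name and use of GCC atomic builtins are illustrative assumptions; the real code is x86 inline assembly in the patch series, and this does not claim to be the final fix.

#include <stdint.h>

#define TICKET_SLOWPATH_FLAG	0x01	/* lives in the tail byte */

struct ticketlock {
	struct {
		uint8_t head;	/* ticket currently being served */
		uint8_t tail;	/* next ticket to hand out, plus flag */
	} tickets;
};

/* would kick the vCPU waiting for ticket 'next' via the hypervisor */
extern void ticket_unlock_kick(struct ticketlock *lock, uint8_t next);

static inline void ticket_unlock(struct ticketlock *lock)
{
	/* only the lock holder ever writes head, so a plain store suffices */
	uint8_t head = __atomic_load_n(&lock->tickets.head, __ATOMIC_RELAXED);

	/* the "addb $0x2,(%rdi)" release of the lock */
	__atomic_store_n(&lock->tickets.head, head + 2, __ATOMIC_RELEASE);

	/*
	 * Full fence (mfence on x86) so the head store is globally visible
	 * before the slowpath flag is read.  Without it, store->load
	 * forwarding lets the flag read complete against the stale value,
	 * and a waiter that sets the flag after reading the old head can
	 * sleep forever; this is the interleaving described above.
	 */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);

	if (__atomic_load_n(&lock->tickets.tail, __ATOMIC_RELAXED) &
	    TICKET_SLOWPATH_FLAG)
		ticket_unlock_kick(lock, head + 2);
}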
Re: [PATCH 0/4] acpi: fix up EJ0 in DSDT
On 09/27/2011 03:04 PM, Michael S. Tsirkin wrote: >There was a concern raised with > two-pass PCI initialization that I need to follow up on before > tagging. The isa bridge? I thought that got fixed ... Daniel Berrange reported a regression with virtio-9p. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm linux-next tree
Hi Avi, On Tue, 27 Sep 2011 15:25:02 +0300 Avi Kivity wrote: > > Please add > > git://github.com/avikivity/kvm.git kvm-updates/3.2 > > to the linux-next collection, until kernel.org is back up. Thanks. I will switch to this tomorrow. -- Cheers, Stephen Rothwell s...@canb.auug.org.au http://www.canb.auug.org.au/~sfr/ pgpsSrtlYk2CI.pgp Description: PGP signature
Re: KVM call agenda for September 26
Juan Quintela wrote: > Hi > > Please send in any agenda items you are interested in covering. > > Thanks, Juan. As there are no agenda items for today, the call is cancelled. Happy hacking, Juan. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 4:12 PM, Roedel, Joerg wrote: > You pass a pointer to an unsigned long for the page-size bitmap. This > allows to use an array of unsigned long. But a single unsigned long is > sufficient This is fine; I can change that if you like it better (though without doing the change below this is probably moot). > and you can use functions like ffs() and fls() together with > shifting. These functions often translate to a single intruction in the > binary. The find_next_bit function has much more overhead because it > needs to handle the array-of-ulong case. So you're suggesting to re-implement find_next_bit() using ffs()/fls() and shifting ? What's the point ? Sure, if we'll have a proven performance issue while using find_next_bit() we can think of doing this, but at this stage, this sounds to me like a premature optimization which isn't too elegant. At this point I strongly prefer readable, maintainable and easier to debug code over optimization which isn't proven to be necessary. Thanks, Ohad. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 08:26:29AM -0400, Ohad Ben-Cohen wrote: > > With an unsigned long you can use plain and fast bit_ops instead of the > > full bitmap functions. > > Not sure I follow; the only bit operation I'm using while mapping is > find_next_bit() (which is a bitops.h method). > > What other faster variant are you referring to ? You pass a pointer to an unsigned long for the page-size bitmap. This allows to use an array of unsigned long. But a single unsigned long is sufficient and you can use functions like ffs() and fls() together with shifting. These functions often translate to a single intruction in the binary. The find_next_bit function has much more overhead because it needs to handle the array-of-ulong case. Joerg -- AMD Operating System Research Center Advanced Micro Devices GmbH Einsteinring 24 85609 Dornach General Managers: Alberto Bozzo, Andrew Bowd Registration: Dornach, Landkr. Muenchen; Registerger. Muenchen, HRB Nr. 43632 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
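As a concrete illustration of the single-unsigned-long point, here is a small user-space helper that picks the largest usable page size with a mask and one fls()-style operation, instead of walking bits with find_next_bit(). The compiler builtins stand in for the kernel bit helpers; this is a sketch of the idea, not kernel code.

#include <stddef.h>

/*
 * pgsize_bitmap: bit i set => page size (1UL << i) supported.
 * addr: iova | paddr of the current chunk; size: bytes left (must be non-zero).
 * Returns the largest supported page size usable here, or 0 if the request
 * is not even aligned to the smallest supported page.
 */
static unsigned long pick_pgsize(unsigned long pgsize_bitmap,
				 unsigned long addr, size_t size)
{
	unsigned long long_bits = 8 * sizeof(long);
	unsigned long align_idx = addr ? __builtin_ctzl(addr) : long_bits - 1;
	unsigned long size_idx = long_bits - 1 - __builtin_clzl((unsigned long)size);
	unsigned long max_idx = align_idx < size_idx ? align_idx : size_idx;
	unsigned long candidates;

	/* keep only the page sizes that fit and are aligned, then take the biggest */
	candidates = pgsize_bitmap & ((2UL << max_idx) - 1);
	if (!candidates)
		return 0;

	return 1UL << (long_bits - 1 - __builtin_clzl(candidates));
}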
[PATCH v2] kvm tools: Set active console before running /bin/sh
This patch sets active console to the serial console before running the shell. Doing so fixes two issues: * Fix job control. * Set all env vars. The user-visible results are fewer warnings (no more of this: sh: cannot set terminal process group (-1): Inappropriate ioctl for device sh: no job control in this shell) A working 'top', and a working ctrl-c. Signed-off-by: Sasha Levin --- tools/kvm/guest/init.c |8 +++- 1 files changed, 7 insertions(+), 1 deletions(-) diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c index 837acfb..4fc6aaf 100644 --- a/tools/kvm/guest/init.c +++ b/tools/kvm/guest/init.c @@ -11,7 +11,7 @@ static int run_process(char *filename) { char *new_argv[] = { filename, NULL }; - char *new_env[] = { NULL }; + char *new_env[] = { "TERM=linux", NULL }; return execve(filename, new_argv, new_env); } @@ -30,6 +30,12 @@ int main(int argc, char *argv[]) do_mounts(); +/* become session leader */ +setsid(); + +/* set controlling terminal */ +ioctl(0, TIOCSCTTY, 1); + puts("Starting '/bin/sh'..."); run_process("/bin/sh"); -- 1.7.6.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/4] acpi: fix up EJ0 in DSDT
On Mon, Sep 26, 2011 at 08:04:24PM -0400, Kevin O'Connor wrote: > On Mon, Sep 26, 2011 at 10:04:13AM +0300, Michael S. Tsirkin wrote: > > On Mon, Sep 26, 2011 at 12:40:18AM -0400, Kevin O'Connor wrote: > > > On Thu, Sep 22, 2011 at 09:09:49AM +0300, Michael S. Tsirkin wrote: > > > > On Thu, Sep 22, 2011 at 12:35:13AM -0400, Kevin O'Connor wrote: > > > > > The code to generate basic SSDT code isn't that difficult (see > > > > > build_ssdt and src/ssdt-proc.dsl). Is there a compelling reason to > > > > > patch the DSDT versus just generating the necessary blocks in an SSDT? > > > > > > > > I don't really care whether the code is in DSDT or SSDT, > > > > IMO there isn't much difference between build_ssdt and patching: > > > > main reason is build_ssdt uses offsets hardcoded to a specific binary > > > > (ssdt_proc and SD_OFFSET_* ) while I used > > > > a script to extract offsets. > > > > > > Yes - your script to extract the offsets is nice. > > > > If you still have doubts, > > it might make sense to merge just patch 1 - > > acpi: generate and parse mixed asl/aml listing > > - as the first step. > > With the infrastructure in place it will be > > easier to discuss the best way to use it. > > I'm okay with your first patch. BTW, any more comments with the rest of the patchset? If you just need to think about it, I understand. > However, I wish to tag a release > before committing ACPI changes. Sure. So you'll take this patchset from here or want me to ping you later? > There was a concern raised with > two-pass PCI initialization that I need to follow up on before > tagging. The isa bridge? I thought that got fixed ... > > -Kevin -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm tools: Set active console before running /bin/sh
This patch sets active console to the serial console before running the shell. Doing so fixes two issues: * Fix job control. * Set all env vars. The user visible issues are less warnings (no more of this: sh: cannot set terminal process group (-1): Inappropriate ioctl for device sh: no job control in this shell) A working 'top', and a working ctrl-c. Signed-off-by: Sasha Levin --- tools/kvm/guest/init.c |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c index 837acfb..b55cd30 100644 --- a/tools/kvm/guest/init.c +++ b/tools/kvm/guest/init.c @@ -30,6 +30,12 @@ int main(int argc, char *argv[]) do_mounts(); +/* get session leader */ +setsid(); + +/* set controlling terminal */ +ioctl (0, TIOCSCTTY, 1); + puts("Starting '/bin/sh'..."); run_process("/bin/sh"); -- 1.7.6.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] virtio-net: Verify page list size before fitting into skb
On Tue, Sep 27, 2011 at 02:20:29PM +0300, Sasha Levin wrote: > On Tue, 2011-09-27 at 10:00 +0300, Michael S. Tsirkin wrote: > > > skb = page_to_skb(vi, page, len); > > > ... > > > > Sorry I don't get it yet. Where is mergeable ignored here? > > The NULL deref happens in page_to_skb(), before merge buffers are > handled. The len here is a single buffer length, so for mergeable buffers it is <= the size of the buffer we gave to hardware, which is PAGE_SIZE. err = virtqueue_add_buf_single(vi->rvq, page_address(page), PAGE_SIZE, false, page, gfp); Unless of course we are trying to work around broken hardware again, which I don't have a problem with, but should probably get appropriate comments in code and trigger a warning. > I'll test it and see if it's really the case. > > -- > > Sasha. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] virtio-net: Verify page list size before fitting into skb
On Tue, 2011-09-27 at 10:00 +0300, Michael S. Tsirkin wrote: > > skb = page_to_skb(vi, page, len); > > ... > > Sorry I don't get it yet. Where is mergeable ignored here? The NULL deref happens in page_to_skb(), before merge buffers are handled. I'll test it and see if it's really the case. -- Sasha. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 1:05 PM, Roedel, Joerg wrote: > On Fri, Sep 16, 2011 at 01:51:41PM -0400, Ohad Ben-Cohen wrote: >> int iommu_map(struct iommu_domain *domain, unsigned long iova, >> - phys_addr_t paddr, int gfp_order, int prot) >> + phys_addr_t paddr, size_t size, int prot) >> { >> - size_t size; >> + int ret = 0; >> + >> + /* >> + * both the virtual address and the physical one, as well as >> + * the size of the mapping, must be aligned (at least) to the >> + * size of the smallest page supported by the hardware >> + */ >> + if (!IS_ALIGNED(iova | paddr | size, iommu_min_pagesz)) { >> + pr_err("unaligned: iova 0x%lx pa 0x%lx size 0x%lx min_pagesz >> " >> + "0x%x\n", iova, (unsigned long)paddr, >> + (unsigned long)size, iommu_min_pagesz); >> + return -EINVAL; >> + } >> + >> + pr_debug("map: iova 0x%lx pa 0x%lx size 0x%lx\n", iova, >> + (unsigned long)paddr, (unsigned long)size); >> + >> + while (size) { >> + unsigned long pgsize = iommu_min_pagesz; >> + unsigned long idx = iommu_min_page_idx; >> + unsigned long addr_merge = iova | paddr; >> + int order; >> + >> + /* find the max page size with which iova, paddr are aligned >> */ >> + for (;;) { >> + unsigned long try_pgsize; >> + >> + idx = find_next_bit(iommu_pgsize_bitmap, >> + iommu_nr_page_bits, idx + 1); >> + >> + /* no more pages to check ? */ >> + if (idx >= iommu_nr_page_bits) >> + break; >> + >> + try_pgsize = 1 << idx; >> >> - size = 0x1000UL << gfp_order; >> + /* page too big ? addresses not aligned ? */ >> + if (size < try_pgsize || >> + !IS_ALIGNED(addr_merge, try_pgsize)) >> + break; >> >> - BUG_ON(!IS_ALIGNED(iova | paddr, size)); >> + pgsize = try_pgsize; >> + } > > With an unsigned long you can use plain and fast bit_ops instead of the > full bitmap functions. Not sure I follow; the only bit operation I'm using while mapping is find_next_bit() (which is a bitops.h method). What other faster variant are you referring to ? Thanks, Ohad. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
kvm linux-next tree
Please add git://github.com/avikivity/kvm.git kvm-updates/3.2 to the linux-next collection, until kernel.org is back up. Thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Fri, Sep 16, 2011 at 01:51:41PM -0400, Ohad Ben-Cohen wrote: > drivers/iommu/iommu.c | 158 > +--- > drivers/iommu/omap-iovmm.c | 12 +--- > include/linux/iommu.h |6 +- > virt/kvm/iommu.c |4 +- > 4 files changed, 157 insertions(+), 23 deletions(-) > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c > index c68ff29..7c01c8c 100644 > --- a/drivers/iommu/iommu.c > +++ b/drivers/iommu/iommu.c > @@ -16,6 +16,8 @@ > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ > > +#define pr_fmt(fmt)"%s: " fmt, __func__ > + > #include > #include > #include > @@ -23,15 +25,73 @@ > #include > #include > #include > +#include > > static struct iommu_ops *iommu_ops; > > +/* bitmap of supported page sizes */ > +static unsigned long *iommu_pgsize_bitmap; > + > +/* number of bits used to represent the supported pages */ > +static unsigned int iommu_nr_page_bits; > + > +/* size of the smallest supported page (in bytes) */ > +static unsigned int iommu_min_pagesz; > + > +/* bit number of the smallest supported page */ > +static unsigned int iommu_min_page_idx; > + > +/** > + * register_iommu() - register an IOMMU hardware > + * @ops: iommu handlers > + * @pgsize_bitmap: bitmap of page sizes supported by the hardware > + * @nr_page_bits: size of @pgsize_bitmap (in bits) > + * > + * Note: this is a temporary function, which will be removed once > + * all IOMMU drivers are converted. The only reason it exists is to > + * allow splitting the pgsizes changes to several patches in order to ease > + * the review. > + */ > +void register_iommu_pgsize(struct iommu_ops *ops, unsigned long > *pgsize_bitmap, > + unsigned int nr_page_bits) I think a plain unsigned long value is sufficient to encode the supported page-sizes. No need to use a full bitmap. > int iommu_map(struct iommu_domain *domain, unsigned long iova, > - phys_addr_t paddr, int gfp_order, int prot) > + phys_addr_t paddr, size_t size, int prot) > { > - size_t size; > + int ret = 0; > + > + /* > +* both the virtual address and the physical one, as well as > +* the size of the mapping, must be aligned (at least) to the > +* size of the smallest page supported by the hardware > +*/ > + if (!IS_ALIGNED(iova | paddr | size, iommu_min_pagesz)) { > + pr_err("unaligned: iova 0x%lx pa 0x%lx size 0x%lx min_pagesz " > + "0x%x\n", iova, (unsigned long)paddr, > + (unsigned long)size, iommu_min_pagesz); > + return -EINVAL; > + } > + > + pr_debug("map: iova 0x%lx pa 0x%lx size 0x%lx\n", iova, > + (unsigned long)paddr, (unsigned long)size); > + > + while (size) { > + unsigned long pgsize = iommu_min_pagesz; > + unsigned long idx = iommu_min_page_idx; > + unsigned long addr_merge = iova | paddr; > + int order; > + > + /* find the max page size with which iova, paddr are aligned > */ > + for (;;) { > + unsigned long try_pgsize; > + > + idx = find_next_bit(iommu_pgsize_bitmap, > + iommu_nr_page_bits, idx + 1); > + > + /* no more pages to check ? */ > + if (idx >= iommu_nr_page_bits) > + break; > + > + try_pgsize = 1 << idx; > > - size = 0x1000UL << gfp_order; > + /* page too big ? addresses not aligned ? */ > + if (size < try_pgsize || > + !IS_ALIGNED(addr_merge, try_pgsize)) > + break; > > - BUG_ON(!IS_ALIGNED(iova | paddr, size)); > + pgsize = try_pgsize; > + } With an unsigned long you can use plain and fast bit_ops instead of the full bitmap functions. 
> > - return iommu_ops->map(domain, iova, paddr, gfp_order, prot); > + order = get_order(pgsize); > + > + pr_debug("mapping: iova 0x%lx pa 0x%lx order %d\n", iova, > + (unsigned long)paddr, order); > + > + ret = iommu_ops->map(domain, iova, paddr, order, prot); > + if (ret) > + break; > + > + size -= pgsize; > + iova += pgsize; > + paddr += pgsize; > + } > + > + return ret; > } > EXPORT_SYMBOL_GPL(iommu_map); > -- AMD Operating System Research Center Advanced Micro Devices GmbH Einsteinring 24 85609 Dornach General Managers: Alberto Bozzo, Andrew Bowd Registration: Dornach, Landkr. Muenchen; Registerger. Muenchen, HRB Nr. 43632 -- To unsubscribe from t
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On 09/27/2011 12:00 PM, Robin Lee Powell wrote: On Tue, Sep 27, 2011 at 01:48:43AM -0700, Robin Lee Powell wrote: > On Tue, Sep 27, 2011 at 04:41:33PM +0800, Emmanuel Noobadmin wrote: > > On 9/27/11, Robin Lee Powell wrote: > > > On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin > > > wrote: > > >> It's unrelated to what you're actually using as the disks, > > >> whether file or block devices like LVs. I think it just makes > > >> KVM tell the host not to cache I/O done on the storage device. > > > > > > Wait, hold on, I think I had it backwards. > > > > > > It tells the *host* to not cache the device in question, or the > > > *VMs* to not cache the device in question? > > > > I'm fairly certain it tells the qemu not to cache the device in > > question. If you don't want the guest to cache their i/o, then the > > guest OS should be configured if it allows that. Although I'm not > > sure if it's possible to disable disk buffering/caching system > > wide in Linux. > > OK, great, thanks. > > Now if I could just figure out how to stop the host from swapping > out much of the VMs' qemu-kvm procs when it has almost a GiB of RAM > left. -_- swappiness 0 doesn't seem to help there. Grrr. I turned swap off to clear it. A few minutes ago, this host was at zero swap: top - 01:59:10 up 10 days, 15:17, 3 users, load average: 6.39, 4.26, 3.24 Tasks: 151 total, 1 running, 150 sleeping, 0 stopped, 0 zombie Cpu(s): 6.6%us, 1.0%sy, 0.0%ni, 85.9%id, 6.3%wa, 0.0%hi, 0.2%si, 0.0%st Mem: 8128772k total, 656k used, 1617656k free,14800k buffers Swap: 8388604k total, 672828k used, 7715776k free,97536k cached PID USER PR NI VIRT RES SHR S %CPU %MEMTIME+ COMMAND 2504 qemu 20 0 2425m 1.8g 448 S 10.0 23.4 3547:59 qemu-kvm 2258 qemu 20 0 2425m 1.7g 444 S 2.7 21.7 1288:15 qemu-kvm 18061 qemu 20 0 2433m 1.8g 428 S 2.3 23.4 401:01.99 qemu-kvm 10335 qemu 20 0 1864m 861m 456 S 1.0 10.9 2:04.26 qemu-kvm [snip] Why is it doing this?!? ;'( Please post the contents of /proc/meminfo and /proc/zoneinfo when this is happening. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [PATCH 00/10] [PATCH RFC V2] Paravirtualized ticketlocks
On Wednesday 14 September 2011, 17:31:32 Jeremy Fitzhardinge wrote: > This series replaces the existing paravirtualized spinlock mechanism > with a paravirtualized ticketlock mechanism. [...] > The unlock code is very straightforward: > prev = *lock; > __ticket_unlock_release(lock); > if (unlikely(__ticket_in_slowpath(lock))) > __ticket_unlock_slowpath(lock, prev); > > which generates: > push %rbp > mov%rsp,%rbp > > movzwl (%rdi),%esi > addb $0x2,(%rdi) > movzwl (%rdi),%eax > testb $0x1,%ah > jne1f > > pop%rbp > retq > > ### SLOWPATH START > 1:movzwl (%rdi),%edx > movzbl %dh,%ecx > mov%edx,%eax > and$-2,%ecx # clear TICKET_SLOWPATH_FLAG > mov%cl,%dh > cmp%dl,%cl # test to see if lock is uncontended > je 3f > > 2:movzbl %dl,%esi > callq *__ticket_unlock_kick# kick anyone waiting > pop%rbp > retq > > 3:lock cmpxchg %dx,(%rdi) # use cmpxchg to safely write back flag > jmp2b > ### SLOWPATH END [...] > Thoughts? Comments? Suggestions? You have a nasty data race in your code that can cause a losing acquirer to sleep forever, because its setting the TICKET_SLOWPATH flag can race with the lock holder releasing the lock. I used the code for the slow path from the GIT repo. Let me try to point out an interleaving: Lock is held by one thread, contains 0x0200. _Lock holder_ _Acquirer_ mov$0x200,%eax lock xadd %ax,(%rdi) // ax:= 0x0200, lock:= 0x0400 ... // this guy spins for a while, reading // the lock ... //trying to free the lock movzwl (%rdi),%esi (esi:=0x0400) addb $0x2,(%rdi) (LOCAL copy of lock is now: 0x0402) movzwl (%rdi),%eax (local forwarding from previous store: eax := 0x0402) testb $0x1,%ah(no wakeup of anybody) jne1f callq *__ticket_lock_spinning ... // __ticket_enter_slowpath(lock) lock or (%rdi), $0x100 // (global view of lock := 0x0500) ... ACCESS_ONCE(lock->tickets.head) == want // (reads 0x00) ... xen_poll_irq(irq); // goes to sleep ... [addb $0x2,(%rdi)] // (becomes globally visible only now! global view of lock := 0x0502) ... Your code is reusing the (just about) safe version of unlocking a spinlock without understanding the effect that close has on later memory ordering. It may work on CPUs that cannot do narrow -> wide store to load forwarding and have to make the addb store visible globally. This is an implementation artifact of specific uarches, and you mustn't rely on it, since our specified memory model allows looser behaviour. Since you want to get that addb out to global memory before the second read, either use a LOCK prefix for it, add an MFENCE between addb and movzwl, or use a LOCKed instruction that will have a fencing effect (e.g., to top-of-stack)between addb and movzwl. Stephan -- Stephan Diestelhorst, AMD Operating System Research Center stephan.diestelho...@amd.com Tel. +49 (0)351 448 356 719 Advanced Micro Devices GmbH Einsteinring 24 85609 Aschheim Germany Geschaeftsfuehrer: Alberto Bozzo Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632, WEEE-Reg-Nr: DE 12919551 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: LPC2011 Virtualization Micro Conf
Hi,

For those who are interested, I have posted the notes from the 2011
Linux Plumbers Virtualization micro conference here:

http://wiki.linuxplumbersconf.org/2011:virtualization

Slides can be found by clicking on the presentation and going onto the
Plumbers abstracts.

Cheers,
Jes
Re: [PATCH] qemu-kvm: Switch POSIX compat AIO implementation to upstream
On 09/26/2011 09:09 PM, Anthony Liguori wrote:
On 09/26/2011 12:24 PM, Avi Kivity wrote:
On 09/26/2011 08:23 PM, Jan Kiszka wrote:
>
> Perhaps qemu_eventfd() can be used in the future instead of an explicit
> pipe. Then Linux will do eventfd while other OSes will fall back to
> pipes.

Basically simpler code, or does this also have runtime benefits?

In corner cases, the completion can block on the write() with pipes, but
not eventfd.

The pipe is O_NONBLOCK and the only thing the caller cares about is
whether there is *some* data in the PIPE so it can happily ignore EAGAIN.
So the pipe implementation never blocks on write() either.

It may require multiple reads to drain the queue though whereas eventfd
would only require a single read to drain the queue.

Oh, so switching to eventfd won't change much. Still nice though IMO.

--
error compiling committee.c: too many arguments to function
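For readers following along, here is a small self-contained sketch of the two
notification styles being compared (this is not qemu code; qemu_eventfd() and
the AIO completion path are only referenced above, and the function names
below are made up for illustration):

	#include <errno.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	/*
	 * Pipe-style wakeup: both ends are O_NONBLOCK.  A full pipe makes
	 * write() fail with EAGAIN, which is fine because the reader only
	 * cares that *some* data is pending.
	 */
	static void notify_pipe(int wfd)
	{
		char c = 0;
		ssize_t r;

		do {
			r = write(wfd, &c, 1);
		} while (r < 0 && errno == EINTR);
		/* EAGAIN deliberately ignored: the pipe already has data. */
	}

	static void drain_pipe(int rfd)
	{
		char buf[512];

		/* May take several reads before the pipe is empty. */
		while (read(rfd, buf, sizeof(buf)) > 0)
			;
	}

	/*
	 * eventfd-style wakeup: the kernel keeps a 64-bit counter, so a
	 * single read always drains every pending notification at once.
	 */
	static void notify_eventfd(int efd)
	{
		uint64_t one = 1;

		(void)write(efd, &one, sizeof(one));
	}

	static void drain_eventfd(int efd)
	{
		uint64_t count;

		(void)read(efd, &count, sizeof(count));
	}

The pipe pair would come from pipe() plus fcntl(F_SETFL, O_NONBLOCK) on both
ends, the eventfd from eventfd(0, EFD_NONBLOCK).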
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On Tue, Sep 27, 2011 at 01:48:43AM -0700, Robin Lee Powell wrote:
> On Tue, Sep 27, 2011 at 04:41:33PM +0800, Emmanuel Noobadmin wrote:
> > On 9/27/11, Robin Lee Powell wrote:
> > > On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin
> > > wrote:
> > >> It's unrelated to what you're actually using as the disks,
> > >> whether file or block devices like LVs. I think it just makes
> > >> KVM tell the host not to cache I/O done on the storage device.
> > >
> > > Wait, hold on, I think I had it backwards.
> > >
> > > It tells the *host* to not cache the device in question, or the
> > > *VMs* to not cache the device in question?
> >
> > I'm fairly certain it tells the qemu not to cache the device in
> > question. If you don't want the guest to cache their i/o, then the
> > guest OS should be configured if it allows that. Although I'm not
> > sure if it's possible to disable disk buffering/caching system
> > wide in Linux.
>
> OK, great, thanks.
>
> Now if I could just figure out how to stop the host from swapping
> out much of the VMs' qemu-kvm procs when it has almost a GiB of RAM
> left. -_- swappiness 0 doesn't seem to help there.

Grrr. I turned swap off to clear it. A few minutes ago, this host was
at zero swap:

top - 01:59:10 up 10 days, 15:17, 3 users, load average: 6.39, 4.26, 3.24
Tasks: 151 total,   1 running, 150 sleeping,   0 stopped,   0 zombie
Cpu(s): 6.6%us, 1.0%sy, 0.0%ni, 85.9%id, 6.3%wa, 0.0%hi, 0.2%si, 0.0%st
Mem:   8128772k total,      656k used,  1617656k free,    14800k buffers
Swap:  8388604k total,   672828k used,  7715776k free,    97536k cached

  PID USER  PR  NI  VIRT  RES  SHR S %CPU %MEM     TIME+  COMMAND
 2504 qemu  20   0 2425m 1.8g  448 S 10.0 23.4   3547:59  qemu-kvm
 2258 qemu  20   0 2425m 1.7g  444 S  2.7 21.7   1288:15  qemu-kvm
18061 qemu  20   0 2433m 1.8g  428 S  2.3 23.4  401:01.99 qemu-kvm
10335 qemu  20   0 1864m 861m  456 S  1.0 10.9    2:04.26 qemu-kvm

[snip]

Why is it doing this?!? ;'(

(I don't know if anyone really has an answer, just wanted to rant)

-Robin
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On Tue, Sep 27, 2011 at 04:41:33PM +0800, Emmanuel Noobadmin wrote:
> On 9/27/11, Robin Lee Powell wrote:
> > On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin
> > wrote:
> >> It's unrelated to what you're actually using as the disks,
> >> whether file or block devices like LVs. I think it just makes
> >> KVM tell the host not to cache I/O done on the storage device.
> >
> > Wait, hold on, I think I had it backwards.
> >
> > It tells the *host* to not cache the device in question, or the
> > *VMs* to not cache the device in question?
>
> I'm fairly certain it tells the qemu not to cache the device in
> question. If you don't want the guest to cache their i/o, then the
> guest OS should be configured if it allows that. Although I'm not
> sure if it's possible to disable disk buffering/caching system
> wide in Linux.

OK, great, thanks.

Now if I could just figure out how to stop the host from swapping
out much of the VMs' qemu-kvm procs when it has almost a GiB of RAM
left. -_- swappiness 0 doesn't seem to help there.

-Robin
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On 9/27/11, Robin Lee Powell wrote:
> On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin wrote:
>> It's unrelated to what you're actually using as the disks, whether
>> file or block devices like LVs. I think it just makes KVM tell the
>> host not to cache I/O done on the storage device.
>
> Wait, hold on, I think I had it backwards.
>
> It tells the *host* to not cache the device in question, or the
> *VMs* to not cache the device in question?

I'm fairly certain it tells the qemu not to cache the device in
question. If you don't want the guest to cache their i/o, then the
guest OS should be configured if it allows that. Although I'm not
sure if it's possible to disable disk buffering/caching system
wide in Linux.
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin wrote:
> On 9/26/11, Robin Lee Powell wrote:
> > On Mon, Sep 26, 2011 at 01:49:18PM +0800, Emmanuel Noobadmin wrote:
> >> On 9/25/11, Robin Lee Powell wrote:
> >> >
> >> > OK, so I've got a Linux host, and a bunch of Linux VMs.
> >> >
> >> > This means that the host *and* all tho VMs do their own disk
> >> > caches/buffers and do their own swap as well.
> >>
> >> If I'm not wrong, that's why the recommended and current default
> >> in libvirtd is to create storage devices with no caching to remove
> >> one layer of duplication.
> >
> > How do you do that? I have my VMs using LVs created on the host as
> > their disks, but I'm open to other methods if there are significant
> > advantages.
>
> It's unrelated to what you're actually using as the disks, whether
> file or block devices like LVs. I think it just makes KVM tell the
> host not to cache I/O done on the storage device.

Wait, hold on, I think I had it backwards.

It tells the *host* to not cache the device in question, or the
*VMs* to not cache the device in question?

-Robin
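For anyone who wants the mechanics: the cache mode is a per-disk setting, so
with a raw LV it would look roughly like this (the device path and target
names here are just placeholders). On the qemu-kvm command line:

	-drive file=/dev/vg0/vm1-disk,if=virtio,format=raw,cache=none

and the libvirt equivalent is the cache attribute on the disk's <driver>
element in the domain XML:

	<disk type='block' device='disk'>
	  <driver name='qemu' type='raw' cache='none'/>
	  <source dev='/dev/vg0/vm1-disk'/>
	  <target dev='vda' bus='virtio'/>
	</disk>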
Re: [PATCH v3] kvm tools: Support multiple net devices
* Sasha Levin wrote:

> --- a/tools/kvm/builtin-run.c
> +++ b/tools/kvm/builtin-run.c
> @@ -87,9 +87,12 @@ static bool sdl;
>  static bool balloon;
>  static bool using_rootfs;
>  static bool custom_rootfs;
> +static bool no_net;
>  extern bool ioport_debug;
>  extern int active_console;
>  extern int debug_iodelay;
> +struct virtio_net_parameters *net_params;
> +int num_net_devices;

Just a stylistic nit-pick: this variable definition section looks pretty
ugly. Those externs should be in a header and in any case different types
of lines should generally not be mixed without at least a newline between
them.

> +static int set_net_param(struct virtio_net_parameters *p, const char *param,
> +			 const char *val)

Naming nit: 'struct virtio_net_params' is shorter by 4 chars and just as
expressive.

> +	char *buf, *cmd = NULL, *cur = NULL;
> +	bool on_cmd = true;
> +
> +	if (arg) {
> +		buf = strdup(arg);
> +		if (buf == NULL)
> +			die("Failed allocating new net buffer");
> +		cur = strtok(buf, ",=");
> +	}
> +
> +	p = (struct virtio_net_parameters) {
> +		.guest_ip	= DEFAULT_GUEST_ADDR,
> +		.host_ip	= DEFAULT_HOST_ADDR,
> +		.script		= DEFAULT_SCRIPT,
> +		.mode		= NET_MODE_TAP,
> +	};
> +
> +	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
> +	p.guest_mac[5] += num_net_devices;
> +
> +	while (cur) {
> +		if (on_cmd) {
> +			cmd = cur;
> +		} else {
> +			if (set_net_param(&p, cmd, cur) < 0)
> +				goto done;
> +		}
> +		on_cmd = !on_cmd;
> +
> +		cur = strtok(NULL, ",=");
> +	};
> +
> +	num_net_devices++;
> +
> +	net_params = realloc(net_params, num_net_devices * sizeof(*net_params));
> +	if (net_params == NULL)
> +		die("Failed adding new network device");
> +
> +	net_params[num_net_devices - 1] = p;
> +
> +done:
> +	return 0;
> +}

Isn't 'buf' leaked here?

Patch looks good otherwise.

Thanks,

	Ingo
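A minimal sketch of the fix for the leak Ingo points out, keeping the v3
variable names: initialize buf to NULL and release it on the shared exit path,
so both the early "goto done" and the normal return free it (free(NULL) is a
no-op, which also covers the case where no argument was given):

	char *buf = NULL, *cmd = NULL, *cur = NULL;
	...
	done:
		free(buf);
		return 0;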
Re: [PATCH 1/2] virtio-net: Verify page list size before fitting into skb
On Tue, Sep 27, 2011 at 09:44:02AM +0300, Sasha Levin wrote:
> On Mon, 2011-09-26 at 22:55 +0300, Michael S. Tsirkin wrote:
> > On Mon, Sep 26, 2011 at 10:37:22PM +0300, Sasha Levin wrote:
> > > On Mon, 2011-09-26 at 21:44 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Sep 26, 2011 at 08:41:08PM +0300, Sasha Levin wrote:
> > > > > This patch verifies that the length of a buffer stored in a linked list
> > > > > of pages is small enough to fit into a skb.
> > > > >
> > > > > If the size is larger than a max size of a skb, it means that we shouldn't
> > > > > go ahead building skbs anyway since we won't be able to send the buffer as
> > > > > the user requested.
> > > > >
> > > > > Cc: Rusty Russell
> > > > > Cc: "Michael S. Tsirkin"
> > > > > Cc: virtualizat...@lists.linux-foundation.org
> > > > > Cc: net...@vger.kernel.org
> > > > > Cc: kvm@vger.kernel.org
> > > > > Signed-off-by: Sasha Levin
> > > >
> > > > Interesting. This is a theoretical issue, correct?
> > > > Not a crash you actually see.
> > >
> > > Actually it was an actual crash caused when our virtio-net driver in kvm
> > > tools did funny things and passed '(u32)-1' length as a buffer length to
> > > the guest kernel.
> > >
> > > > This crash would mean device is giving us packets
> > > > that are way too large. Avoiding crashes even in the face of
> > > > a misbehaved device is a good idea, but should
> > > > we print a diagnostic to a system log?
> > > > Maybe rate-limited or print once to avoid filling
> > > > up the disk. Other places in driver print with pr_debug
> > > > I'm not sure that's right but better than nothing.
> > >
> > > Yup, I'll add some debug info.
> > >
> > > > > ---
> > > > >  drivers/net/virtio_net.c |    3 +++
> > > > >  1 files changed, 3 insertions(+), 0 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index 0c7321c..64e0717 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -165,6 +165,9 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >  	unsigned int copy, hdr_len, offset;
> > > > >  	char *p;
> > > > >
> > > > > +	if (len > MAX_SKB_FRAGS * PAGE_SIZE)
> > > >
> > > > unlikely()?
> > > >
> > > > Also, this seems too aggressive: at this point len includes the header
> > > > and the linear part. The right place for this
> > > > test is probably where we fill in the frags, just before
> > > > 	while (len)
> > > >
> > > > The whole can only happen when mergeable buffers
> > > > are disabled, right?
> > >
> > > From what I understand it can happen whenever you're going to build a
> > > skb longer than PAGE_SIZE.
> >
> > Hmm how exactly? With mergeable buffers this only gets
> > the length of the 1st chunk which is up to 4K unless the driver
> > is buggy ...
>
> What about the case where TSO or ECN features are set? The code flow
> suggests that mergeable would get ignored in that case:
>
> 	/* If we can receive ANY GSO packets, we must allocate large ones. */
> 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> 	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
> 	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
> 		vi->big_packets = true;
>
> [...]
>
> 	if (!vi->mergeable_rx_bufs && !vi->big_packets) {
> 		skb = buf;
> 		len -= sizeof(struct virtio_net_hdr);
> 		skb_trim(skb, len);
> 	} else {
> 		page = buf;
> 		skb = page_to_skb(vi, page, len);
> 	...

Sorry I don't get it yet. Where is mergeable ignored here?
> I haven't actually tested it with mergeable buffers enabled, but could
> do it later today.

Please do.

> --
>
> Sasha.
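Putting Michael's three comments together (wrap the test in unlikely(), print
a rate-limited pr_debug diagnostic, and move the check to just before the
frag-filling "while (len)" loop in page_to_skb()), the check would land
roughly like this; this is a sketch of the direction discussed, not
necessarily the final patch:

	/*
	 * Just before the while (len) loop that fills in the frags: a
	 * device handing us a buffer that cannot fit in MAX_SKB_FRAGS
	 * pages is broken, so log (rate-limited) and drop instead of
	 * overrunning the skb.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		if (net_ratelimit())
			pr_debug("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}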