[PATCH v4] kvm tools: Support multiple net devices
This patch adds support for multiple network devices. The command line syntax changes to the following: --network/-n [mode=tap/user/none][,guest_ip=ip][,host_ip= ip][,guest_mac=mac][,script=file] Each of the parameters is optional, and the config defaults to a TAP based networking with a sequential MAC. Signed-off-by: Sasha Levin --- tools/kvm/builtin-run.c| 147 - tools/kvm/include/kvm/virtio-net.h |4 +- tools/kvm/virtio/net.c | 159 3 files changed, 197 insertions(+), 113 deletions(-) diff --git a/tools/kvm/builtin-run.c b/tools/kvm/builtin-run.c index 28dc95a..a88503f 100644 --- a/tools/kvm/builtin-run.c +++ b/tools/kvm/builtin-run.c @@ -65,6 +65,7 @@ __thread struct kvm_cpu *current_kvm_cpu; static u64 ram_size; static u8 image_count; +static u8 num_net_devices; static bool virtio_rng; static const char *kernel_cmdline; static const char *kernel_filename; @@ -80,6 +81,7 @@ static const char *guest_mac; static const char *host_mac; static const char *script; static const char *guest_name; +static struct virtio_net_params *net_params; static bool single_step; static bool readonly_image[MAX_DISK_IMAGES]; static bool vnc; @@ -87,6 +89,7 @@ static bool sdl; static bool balloon; static bool using_rootfs; static bool custom_rootfs; +static bool no_net; extern bool ioport_debug; extern int active_console; extern int debug_iodelay; @@ -182,6 +185,90 @@ static int tty_parser(const struct option *opt, const char *arg, int unset) return 0; } +static inline void str_to_mac(const char *str, char *mac) +{ + sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + mac, mac+1, mac+2, mac+3, mac+4, mac+5); +} +static int set_net_param(struct virtio_net_params *p, const char *param, + const char *val) +{ + if (strcmp(param, "guest_mac") == 0) { + str_to_mac(val, p->guest_mac); + } else if (strcmp(param, "mode") == 0) { + if (!strncmp(val, "user", 4)) { + int i; + + for (i = 0; i < num_net_devices; i++) + if (net_params[i].mode == NET_MODE_USER) + die("Only one usermode network device allowed at a time"); + p->mode = NET_MODE_USER; + } else if (!strncmp(val, "tap", 3)) { + p->mode = NET_MODE_TAP; + } else if (!strncmp(val, "none", 4)) { + no_net = 1; + return -1; + } else + die("Unkown network mode %s, please use user, tap or none", network); + } else if (strcmp(param, "script") == 0) { + p->script = strdup(val); + } else if (strcmp(param, "guest_ip") == 0) { + p->guest_ip = strdup(val); + } else if (strcmp(param, "host_ip") == 0) { + p->host_ip = strdup(val); + } + + return 0; +} + +static int netdev_parser(const struct option *opt, const char *arg, int unset) +{ + struct virtio_net_params p; + char *buf = NULL, *cmd = NULL, *cur = NULL; + bool on_cmd = true; + + if (arg) { + buf = strdup(arg); + if (buf == NULL) + die("Failed allocating new net buffer"); + cur = strtok(buf, ",="); + } + + p = (struct virtio_net_params) { + .guest_ip = DEFAULT_GUEST_ADDR, + .host_ip= DEFAULT_HOST_ADDR, + .script = DEFAULT_SCRIPT, + .mode = NET_MODE_TAP, + }; + + str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac); + p.guest_mac[5] += num_net_devices; + + while (cur) { + if (on_cmd) { + cmd = cur; + } else { + if (set_net_param(&p, cmd, cur) < 0) + goto done; + } + on_cmd = !on_cmd; + + cur = strtok(NULL, ",="); + }; + + num_net_devices++; + + net_params = realloc(net_params, num_net_devices * sizeof(*net_params)); + if (net_params == NULL) + die("Failed adding new network device"); + + net_params[num_net_devices - 1] = p; + +done: + free(buf); + return 0; +} + static int shmem_parser(const struct option *opt, const char *arg, int unset) { 
const u64 default_size = SHMEM_DEFAULT_SIZE; @@ -339,18 +426,9 @@ static const struct option options[] = { "Kernel command line arguments"), OPT_GROUP("Networking options:"), - OPT_STRING('n', "network", &network, "user, tap, none", - "Network to use"), - OPT_STRING('\0', "host-ip", &host_ip, "a.b.c.d", - "Assign this address to the host side networking")
Re: [Qemu-devel] RFC [v2]: vfio / device assignment -- layout of device fd files
On Tue, 2011-09-27 at 16:28 -0500, Scott Wood wrote: > On 09/26/2011 07:45 PM, Alex Williamson wrote: > > On Mon, 2011-09-26 at 18:59 -0500, Scott Wood wrote: > >> On 09/26/2011 01:34 PM, Alex Williamson wrote: > >>> /* Reset the device */ > >>> #define VFIO_DEVICE_RESET _IO(, ,) > >> > >> What generic way do we have to do this? We should probably have a way > >> to determine whether it's possible, without actually asking to do it. > > > > It's not generic, it could be a VFIO_DEVICE_PCI_RESET or we could add a > > bit to the device flags to indicate if it's available or we could add a > > "probe" arg to the ioctl to either check for existence or do it. > > Even with PCI, isn't this only possible if function-level reset is > supported? There are a couple other things we can do if FLR isn't present (D3hot transition, secondary bus reset, device specific resets are possible). > I think we need a flag. Ok, PCI has a pci_probe_reset_function() and pci_reset_function(). I'd probably mimic those in the vfio device ops. Common vfio code can probe the reset and set the flag appropriately and we can have a common VFIO_DEVICE_RESET ioctl that calls into the device ops reset function. > For devices that can't be reset by the kernel, we'll want the ability to > stop/start DMA acccess through the IOMMU (or other bus-specific means), > separate from whether the fd is open. If a device is assigned to a > partition and that partition gets reset, we'll want to disable DMA > before we re-use the memory, and enable it after the partition has reset > or quiesced the device (which requires the fd to be open). Maybe this can be accomplished via an iommu_detach_device() to temporarily disassociate it from the domain. We could also unmap all the DMA. Anyway, a few possibilities. > >>> /* PCI MSI setup, arg[0] = #, arg[1-n] = eventfds */ > >>> #define VFIO_DEVICE_PCI_SET_MSI_EVENTFDS _IOW(, , int) > >>> #define VFIO_DEVICE_PCI_SET_MSIX_EVENTFDS _IOW(, , int) > >>> > >>> Hope that covers it. > >> > >> It could be done this way, but I predict that the code (both kernel and > >> user side) will be larger. Maybe not much more complex, but more > >> boilerplate. > >> > >> How will you manage extensions to the interface? > > > > I would assume we'd do something similar to the kvm capabilities checks. > > This information is already built into the data-structure approach. If we define it to be part of the flags, then it's built-in to the ioctl approach too... > >> The table should not be particularly large, and you'll need to keep the > >> information around in some form regardless. Maybe in the PCI case you > >> could produce it dynamically (though I probably wouldn't), but it really > >> wouldn't make sense in the device tree case. > > > > It would be entirely dynamic for PCI, there's no advantage to caching > > it. Even for device tree, if you can't fetch it dynamically, you'd have > > to duplicate it between an internal data structure and a buffer reading > > the table. > > I don't think we'd need to keep the device tree path/index info around > for anything but the table -- but really, this is a minor consideration. > > >> You also lose the ability to easily have a human look at the hexdump for > >> debugging; you'll need a special "lsvfio" tool. You might want one > >> anyway to pretty-print the info, but with ioctls it's mandatory. > > > > I don't think this alone justifies duplicating the data and making it > > difficult to parse on both ends. 
Chances are we won't need such a tool > > for the ioctl interface because it's easier to get it right the first > > time ;) > > It's not just useful for getting the code right, but for e.g. sanity > checking that the devices were bound properly. I think such a tool > would be generally useful, no matter what the kernel interface ends up > being. I don't just use lspci to debug the PCI subsystem. :-) This is also a minor consideration. Looking at hexdumps isn't much to rely on for debugging and if we take the step of writing a tool, it's not much harder to write for either interface. The table is more akin to dumping the data, but I feel the ioctl is easier for how a driver would probably make use of the data (linear vs random access). > > Note that I'm not stuck on this interface, I was just thinking about how > > to generate the table last week, it seemed like a pain so I thought I'd > > spend a few minutes outlining an ioctl interface... turns out it's not > > so bad. Thanks, > > Yeah, it can work either way, as long as the information's there and > there's a way to add new bits of information, or new bus types, down the > road. Mainly a matter of aesthetics between the two. It'd be nice if David would chime back in since he objected to the table. Does an ioctl interface look better? Alex Graf, any opinions? Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.ke
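To make the "capability flag plus a common reset ioctl" idea discussed above concrete, here is a hypothetical user-space sketch. Every constant, struct and ioctl number below is invented purely for illustration (the placeholders quoted in the thread are not yet a real ABI), so this is one possible reading of the proposal, not the eventual VFIO interface.

#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical names only; not the merged VFIO ABI */
struct hypothetical_device_info {
	uint32_t argsz;
	uint32_t flags;
#define HYPOTHETICAL_DEVICE_FLAG_RESET	(1 << 0)	/* kernel can reset this device */
	uint32_t num_regions;
	uint32_t num_irqs;
};

#define HYPOTHETICAL_GET_DEVICE_INFO	_IOR(';', 100, struct hypothetical_device_info)
#define HYPOTHETICAL_DEVICE_RESET	_IO(';', 101)

static int reset_if_possible(int device_fd)
{
	struct hypothetical_device_info info = { .argsz = sizeof(info) };

	if (ioctl(device_fd, HYPOTHETICAL_GET_DEVICE_INFO, &info) < 0)
		return -1;

	/*
	 * Common vfio code would probe the bus-specific reset (e.g. via
	 * pci_probe_reset_function()) and set this flag accordingly, so
	 * userspace can check for support without actually resetting.
	 */
	if (!(info.flags & HYPOTHETICAL_DEVICE_FLAG_RESET)) {
		fprintf(stderr, "device cannot be reset by the kernel\n");
		return -1;
	}

	/* common ioctl that dispatches to the bus-specific device op */
	return ioctl(device_fd, HYPOTHETICAL_DEVICE_RESET);
}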
tcpdump locks up kvm host for a while.
I can reproduce this at will, so feel free to ask for debugging steps. My host and VMs are all Fedora 15. Every time I run tcpdump on a VM, that VM hangs for a while: it uses all available CPU, "virsh list" hangs trying to show it, and I can't shut it down except by killing the qemu-kvm process. After a few minutes (the delay seems proportional to how long it's been since I last ran tcpdump: the longer it's been, the longer I have to wait), everything goes back to normal if I don't kill the VM. Any idea what's going on there? -Robin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvm] Weird console issue.
On Mon, Sep 26, 2011 at 01:45:07AM -0700, Robin Lee Powell wrote: > Everything seems to work until it asks for a password. As soon as > that happens, any character I type causes it to respond as though > I'd hit "enter". This continues until /bin/login completes and a > new getty starts, at which point it prints the right headers and I > can enter my userid, and then it asks for a password and it breaks > again in the same way. Bizarrely, the problem turned out to be in my kernel command line (the grub "kernel" entry): kernel /vmlinuz-2.6.38.8-32.fc15.x86_64 ro root=/dev/vda2 rd_NO_LUKS rd_NO_LVM rd_NO_MD rd_NO_DM LANG=en_US.UTF-8 SYSFONT=latarcyrheb-sun16 KEYTABLE=us console=ttyS0,115200 console=tty0 rhgb Apparently, having the two console= entries screwed things up, which is kind of lame. But hey, problem solved! :D -Robin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
order 1 page allocation failures
Hi, I've been having some issues with KVM recently where one or more vms will cause page allocation failure messages, usually with the backtrace including networking functions, example follows: [362409.429944] kvm: page allocation failure: order:1, mode:0x20 [362409.429957] Pid: 3453, comm: kvm Not tainted 3.0.0-1-amd64 #1 [362409.429965] Call Trace: [362409.429970][] ? warn_alloc_failed+0x108/0x11b [362409.429998] [] ? __alloc_pages_nodemask+0x6e6/0x75c [362409.430012] [] ? kmem_getpages+0x55/0xf0 [362409.430022] [] ? fallback_alloc+0x129/0x1c1 [362409.430035] [] ? paravirt_read_tsc+0x5/0x8 [362409.430045] [] ? kmem_cache_alloc+0x73/0xf0 [362409.430057] [] ? sk_prot_alloc+0x2b/0x128 [362409.430067] [] ? sk_clone+0x14/0x2bd [362409.430077] [] ? inet_csk_clone+0x10/0x91 [362409.430088] [] ? tcp_create_openreq_child+0x21/0x41a [362409.430099] [] ? tcp_v4_syn_recv_sock+0x33/0x208 [362409.430110] [] ? tcp_check_req+0x1ff/0x2dd [362409.430122] [] ? inet_csk_search_req+0x35/0xa7 [362409.430132] [] ? tcp_v4_do_rcv+0x206/0x32c [362409.430144] [] ? tcp_v4_rcv+0x419/0x66c [362409.430154] [] ? native_sched_clock+0x28/0x30 [362409.430173] [] ? ip_local_deliver_finish+0x14b/0x1bb [362409.430186] [] ? __netif_receive_skb+0x3d7/0x40b [362409.430197] [] ? netif_receive_skb+0x52/0x58 [362409.430220] [] ? br_nf_pre_routing_finish+0x1d4/0x1e1 [bridge] [362409.430241] [] ? NF_HOOK_THRESH+0x3b/0x55 [bridge] [362409.430260] [] ? br_nf_pre_routing+0x3be/0x3cb [bridge] [362409.430272] [] ? nf_iterate+0x41/0x77 [362409.430288] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430305] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430315] [] ? nf_hook_slow+0x73/0x111 [362409.430330] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430342] [] ? try_to_wake_up+0x199/0x199 [362409.430358] [] ? NF_HOOK.clone.4+0x56/0x56 [bridge] [362409.430375] [] ? NF_HOOK.clone.4+0x3c/0x56 [bridge] [362409.430392] [] ? br_handle_frame+0x1af/0x1c6 [bridge] [362409.430408] [] ? br_handle_frame_finish+0x1f3/0x1f3 [bridge] [362409.430420] [] ? __netif_receive_skb+0x2c4/0x40b [362409.430432] [] ? process_backlog+0x78/0x157 [362409.430443] [] ? net_rx_action+0xa4/0x1b2 [362409.430454] [] ? test_tsk_need_resched+0xe/0x17 [362409.430465] [] ? __do_softirq+0xb9/0x178 [362409.430476] [] ? call_softirq+0x1c/0x30 [362409.430481][] ? do_softirq+0x3f/0x84 [362409.430498] [] ? netif_rx_ni+0x1e/0x27 [362409.430509] [] ? tun_get_user+0x390/0x3b8 [tun] [362409.430520] [] ? bit_waitqueue+0x71/0xa4 [362409.430529] [] ? _flat_send_IPI_mask+0x6a/0x7c [362409.430541] [] ? tun_get_socket+0x3b/0x3b [tun] [362409.430552] [] ? tun_chr_aio_write+0x5e/0x79 [tun] [362409.430563] [] ? do_sync_readv_writev+0x9a/0xd5 [362409.430574] [] ? need_resched+0x1a/0x23 [362409.430585] [] ? _cond_resched+0x9/0x20 [362409.430596] [] ? copy_from_user+0x18/0x30 [362409.430608] [] ? security_file_permission+0x18/0x33 [362409.430618] [] ? do_readv_writev+0xa4/0x11a [362409.430627] [] ? fput+0x1a/0x1a2 [362409.430636] [] ? sys_writev+0x45/0x90 [362409.430647] [] ? 
system_call_fastpath+0x16/0x1b [362409.430654] Mem-Info: [362409.430659] Node 0 DMA per-cpu: [362409.430667] CPU0: hi:0, btch: 1 usd: 0 [362409.430673] CPU1: hi:0, btch: 1 usd: 0 [362409.430679] CPU2: hi:0, btch: 1 usd: 0 [362409.430686] CPU3: hi:0, btch: 1 usd: 0 [362409.430691] Node 0 DMA32 per-cpu: [362409.430699] CPU0: hi: 186, btch: 31 usd: 147 [362409.430705] CPU1: hi: 186, btch: 31 usd: 82 [362409.430712] CPU2: hi: 186, btch: 31 usd: 171 [362409.430718] CPU3: hi: 186, btch: 31 usd: 86 [362409.430724] Node 0 Normal per-cpu: [362409.430730] CPU0: hi: 186, btch: 31 usd: 157 [362409.430737] CPU1: hi: 186, btch: 31 usd: 96 [362409.430743] CPU2: hi: 186, btch: 31 usd: 163 [362409.430749] CPU3: hi: 186, btch: 31 usd: 173 [362409.430764] active_anon:576670 inactive_anon:110462 isolated_anon:0 [362409.430769] active_file:615766 inactive_file:638287 isolated_file:0 [362409.430774] unevictable:0 dirty:3203 writeback:0 unstable:0 [362409.430778] free:31113 slab_reclaimable:36977 slab_unreclaimable:11009 [362409.430783] mapped:11738 shmem:226 pagetables:9104 bounce:0 [362409.430791] Node 0 DMA free:15912kB min:128kB low:160kB high:192kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15688kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes [362409.430827] lowmem_reserve[]: 0 3254 8051 8051 [362409.430837] Node 0 DMA32 free:57588kB min:27260kB low:34072kB high:40888kB active_anon:454620kB inactive_anon:92920kB active_file:1279680kB inactive_file:1348708kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3332192kB mlock
Re: How many threads should a kvm vm be starting?
On September 27, 2011, Avi Kivity wrote: > On 09/27/2011 03:29 AM, Thomas Fjellstrom wrote: > > I just noticed something interesting, a virtual machine on one of my > > servers seems to have 69 threads (including the main thread). Other > > guests on the machine only have a couple threads. > > > > Is this normal? or has something gone horribly wrong? > > It's normal if the guest does a lot of I/O. The thread count should go > down when the guest idles. Ah, that would make sense. Though it kind of defeats the purpose of assigning a VM a single cpu/core: a single VM can now DoS an entire multi-core server. It pretty much pegged my dual-core (with HT) server for a couple of hours. -- Thomas Fjellstrom tho...@fjellstrom.ca -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] RFC [v2]: vfio / device assignment -- layout of device fd files
On 09/26/2011 07:45 PM, Alex Williamson wrote: > On Mon, 2011-09-26 at 18:59 -0500, Scott Wood wrote: >> On 09/26/2011 01:34 PM, Alex Williamson wrote: >>> /* Reset the device */ >>> #define VFIO_DEVICE_RESET _IO(, ,) >> >> What generic way do we have to do this? We should probably have a way >> to determine whether it's possible, without actually asking to do it. > > It's not generic, it could be a VFIO_DEVICE_PCI_RESET or we could add a > bit to the device flags to indicate if it's available or we could add a > "probe" arg to the ioctl to either check for existence or do it. Even with PCI, isn't this only possible if function-level reset is supported? I think we need a flag. For devices that can't be reset by the kernel, we'll want the ability to stop/start DMA acccess through the IOMMU (or other bus-specific means), separate from whether the fd is open. If a device is assigned to a partition and that partition gets reset, we'll want to disable DMA before we re-use the memory, and enable it after the partition has reset or quiesced the device (which requires the fd to be open). >>> /* PCI MSI setup, arg[0] = #, arg[1-n] = eventfds */ >>> #define VFIO_DEVICE_PCI_SET_MSI_EVENTFDS_IOW(, , int) >>> #define VFIO_DEVICE_PCI_SET_MSIX_EVENTFDS _IOW(, , int) >>> >>> Hope that covers it. >> >> It could be done this way, but I predict that the code (both kernel and >> user side) will be larger. Maybe not much more complex, but more >> boilerplate. >> >> How will you manage extensions to the interface? > > I would assume we'd do something similar to the kvm capabilities checks. This information is already built into the data-structure approach. >> The table should not be particularly large, and you'll need to keep the >> information around in some form regardless. Maybe in the PCI case you >> could produce it dynamically (though I probably wouldn't), but it really >> wouldn't make sense in the device tree case. > > It would be entirely dynamic for PCI, there's no advantage to caching > it. Even for device tree, if you can't fetch it dynamically, you'd have > to duplicate it between an internal data structure and a buffer reading > the table. I don't think we'd need to keep the device tree path/index info around for anything but the table -- but really, this is a minor consideration. >> You also lose the ability to easily have a human look at the hexdump for >> debugging; you'll need a special "lsvfio" tool. You might want one >> anyway to pretty-print the info, but with ioctls it's mandatory. > > I don't think this alone justifies duplicating the data and making it > difficult to parse on both ends. Chances are we won't need such a tool > for the ioctl interface because it's easier to get it right the first > time ;) It's not just useful for getting the code right, but for e.g. sanity checking that the devices were bound properly. I think such a tool would be generally useful, no matter what the kernel interface ends up being. I don't just use lspci to debug the PCI subsystem. :-) > Note that I'm not stuck on this interface, I was just thinking about how > to generate the table last week, it seemed like a pain so I thought I'd > spend a few minutes outlining an ioctl interface... turns out it's not > so bad. Thanks, Yeah, it can work either way, as long as the information's there and there's a way to add new bits of information, or new bus types, down the road. Mainly a matter of aesthetics between the two. 
-Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [KVM-autotest][PATCH] cgroup test with KVM guest +first subtests
On 09/22/2011 01:29 PM, Lukas Doktor wrote: Hi guys, Do you remember the discussion about cgroup testing in autotest vs. LTP? I hope there won't be any doubts about this one as ground_test (+ first 2 subtests) are strictly focused on cgroups features enforced on KVM guest systems. Also more subtests will follow if you approve the test structure (blkio_throttle, memory, cpus...). No matter whether we drop or keep the general 'cgroup' test. The 'cgroup_common.py' library can be imported either from 'client/tests/cgroup/' directory or directly from 'client/tests/kvm/tests/' directory. The modifications of 'cgroup_common.py' library is backward compatible with general cgroup test. See the commits for details. Ok, after quite a bit of back and forth, pull request merged, thanks Lukas! Regards, Lukáš Doktor -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 09:28:37AM -0400, Ohad Ben-Cohen wrote: > So you're suggesting to re-implement find_next_bit() using ffs()/fls() > and shifting ? No. I suggest a simpler and shorter algorithm using the bit helpers. Something like that: min_idx = __ffs(iommu_page_sizes); while (size) { /* Max alignment allowed by current physical address */ phys_idx = __ffs(phys); /* Max alignment allowed by current size */ size_idx = __fls(size); /* special case: iova == 0 */ if (likely(phys)) idx = min(phys_idx, size_idx); else idx = size_idx; BUG_ON(idx < min_idx); psize = 1UL << idx; /* search next smaller page-size supported */ while (psize && !(iommu_page_sizes & psize)) psize >>= 1; BUG_ON(psize == 0); iommu_ops->map(domain, iova, phys, get_order(psize), prot); iova += psize; phys += psize; size -= psize; } It is only C-style pseudo-code, of course. These __ffs and __fls lines all translate to a single instruction later. The find_next_bit() function has a lot more overhead because it needs to take account of real bitmaps (arrays of ulong). But this complexity is not required here. And yes, overhead is important when we implement the generic dma-ops on-top of the iommu-api because this will make the iommu_map function a fast-path. So we really care about overhead here. Joerg -- AMD Operating System Research Center Advanced Micro Devices GmbH Einsteinring 24 85609 Dornach General Managers: Alberto Bozzo, Andrew Bowd Registration: Dornach, Landkr. Muenchen; Registerger. Muenchen, HRB Nr. 43632 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
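For readers following the thread, here is a self-contained user-space rendering of the loop sketched above, so it can be compiled and traced. It is a sketch under stated assumptions: the kernel's __ffs()/__fls() helpers are replaced by compiler builtins, iommu_ops->map() is replaced by a printf(), and the supported-page-size bitmap is an example value, not anything read from hardware.

#include <stdio.h>
#include <stddef.h>

/* bit i set => a page size of (1UL << i) bytes is supported: 4K, 2M, 1G */
static const unsigned long iommu_page_sizes =
		(1UL << 12) | (1UL << 21) | (1UL << 30);

static unsigned long bit_ffs(unsigned long x)
{
	return __builtin_ctzl(x);			/* stands in for __ffs() */
}

static unsigned long bit_fls(unsigned long x)
{
	return 8 * sizeof(long) - 1 - __builtin_clzl(x);	/* stands in for __fls() */
}

static int map_range(unsigned long iova, unsigned long phys, size_t size)
{
	const unsigned long min_idx = bit_ffs(iommu_page_sizes);

	while (size) {
		unsigned long addr_merge = iova | phys;
		/* max alignment allowed by the current addresses and by the size */
		unsigned long idx = addr_merge ? bit_ffs(addr_merge) : bit_fls(size);
		unsigned long psize;

		if (idx > bit_fls(size))
			idx = bit_fls(size);

		/* search the next smaller page size supported by the hardware */
		psize = 1UL << idx;
		while (psize && !(iommu_page_sizes & psize))
			psize >>= 1;

		if (psize < (1UL << min_idx))
			return -1;	/* unaligned request; kernel code would BUG_ON() */

		/* iommu_ops->map(domain, iova, phys, get_order(psize), prot) goes here */
		printf("map iova 0x%lx -> pa 0x%lx, 0x%lx bytes\n", iova, phys, psize);

		iova += psize;
		phys += psize;
		size -= psize;
	}

	return 0;
}

int main(void)
{
	/* 2 MB + 4 KB at 2 MB-aligned addresses: one 2M mapping, then one 4K mapping */
	return map_range(0x40200000UL, 0x80000000UL, (2UL << 20) + 4096);
}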
Qemu/KVM is 3x slower under libvirt
I repost this, this time by also including the libvirt mailing list. Info on my libvirt: it's the version in Ubuntu 11.04 Natty which is 0.8.8-1ubuntu6.5 . I didn't recompile this one, while Kernel and qemu-kvm are vanilla and compiled by hand as described below. My original message follows: This is really strange. I just installed a new host with kernel 3.0.3 and Qemu-KVM 0.14.1 compiled by me. I have created the first VM. This is on LVM, virtio etc... if I run it directly from bash console, it boots in 8 seconds (it's a bare ubuntu with no graphics), while if I boot it under virsh (libvirt) it boots in 20-22 seconds. This is the time from after Grub to the login prompt, or from after Grub to the ssh-server up. I was almost able to replicate the whole libvirt command line on the bash console, and it still goes almost 3x faster when launched from bash than with virsh start vmname. The part I wasn't able to replicate is the -netdev part because I still haven't understood the semantics of it. This is my bash commandline: /opt/qemu-kvm-0.14.1/bin/qemu-system-x86_64 -M pc-0.14 -enable-kvm -m 2002 -smp 2,sockets=2,cores=1,threads=1 -name vmname1-1 -uuid ee75e28a-3bf3-78d9-3cba-65aa63973380 -nodefconfig -nodefaults -chardev socket,id=charmonitor,path=/var/lib/libvirt/qemu/vmname1-1.monitor,server,nowait -mon chardev=charmonitor,id=monitor,mode=readline -rtc base=utc -boot order=dc,menu=on -drive file=/dev/mapper/vgPtpVM-lvVM_Vmname1_d1,if=none,id=drive-virtio-disk0,boot=on,format=raw,cache=none,aio=native -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -drive if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw,cache=none,aio=native -device ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -net nic,model=virtio -net tap,ifname=tap0,script=no,downscript=no -usb -vnc 127.0.0.1:0 -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 Which was taken from libvirt's command line. The only modifications I did to the original libvirt commandline (seen with ps aux) were: - Removed -S - Network was: -netdev tap,fd=17,id=hostnet0,vhost=on,vhostfd=18 -device virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:05:36:60,bus=pci.0,addr=0x3 Has been simplified to: -net nic,model=virtio -net tap,ifname=tap0,script=no,downscript=no and manual bridging of the tap0 interface. Firstly I had thought that this could be fault of the VNC: I have compiled qemu-kvm with no separate vnc thread. I thought that libvirt might have connected to the vnc server at all times and this could have slowed down the whole VM. But then I also tried connecting vith vncviewer to the KVM machine launched directly from bash, and the speed of it didn't change. So no, it doesn't seem to be that. BTW: is the slowdown of the VM on "no separate vnc thread" only in effect when somebody is actually connected to VNC, or always? Also, note that the time difference is not visible in dmesg once the machine has booted. So it's not a slowdown in detecting devices. Devices are always detected within the first 3 seconds, according to dmesg, at 3.6 seconds the first ext4 mount begins. It seems to be really the OS boot that is slow... it seems an hard disk performance problem. Thank you R. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] KVM test: unattended_install: Remove unused variable images_dir
Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 7d90a91..7655bc1 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -211,7 +211,6 @@ class UnattendedInstallConfig(object): @param params: Dictionary with test parameters. """ root_dir = test.bindir -images_dir = os.path.join(root_dir, 'images') self.deps_dir = os.path.join(root_dir, 'deps') self.unattended_dir = os.path.join(root_dir, 'unattended') -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] KVM test: unattended_install: Fix the background migration code
Currently we use initrd and kernel params to perform unattended install in all supported linux guests. Before migrating, we used to remove those params from the qemu params because they allegedly 'broke' the migration. Well, it turns out this code renders our migrate-in-background code path useless, and it does not appear to break migration in the tests I've run. So, let's just get rid of it and be happy. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py | 10 -- 1 files changed, 0 insertions(+), 10 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 023d00f..b1d23f6 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -611,16 +611,6 @@ def run_unattended_install(test, params, env): pass if migrate_background: -# Drop the params which may break the migration -# Better method is to use dnsmasq to do the -# unattended installation -if vm.params.get("initrd"): -vm.params["initrd"] = None -if vm.params.get("kernel"): -vm.params["kernel"] = None -if vm.params.get("extra_params"): -vm.params["extra_params"] = re.sub("--append '.*'", "", - vm.params["extra_params"]) vm.migrate(timeout=mig_timeout, protocol=mig_protocol) else: time.sleep(1) -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4] KVM test: unattended_install: Remove unused variable post_install_delay
Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 7655bc1..023d00f 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -580,7 +580,6 @@ def run_unattended_install(test, params, env): vm.create() install_timeout = int(params.get("timeout", 3000)) -post_install_delay = int(params.get("post_install_delay", 0)) port = vm.get_port(int(params.get("guest_port_unattended_install"))) migrate_background = params.get("migrate_background") == "yes" -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] KVM test: unattended_install: Fix unused variable warning
Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/tests/unattended_install.py |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/client/tests/kvm/tests/unattended_install.py b/client/tests/kvm/tests/unattended_install.py index 1ff73af..7d90a91 100644 --- a/client/tests/kvm/tests/unattended_install.py +++ b/client/tests/kvm/tests/unattended_install.py @@ -91,6 +91,7 @@ class FloppyDisk(Disk): m_cmd = 'mount -o loop,rw %s %s' % (path, self.mount) utils.run(m_cmd) except error.CmdError, e: +logging.error("Error during floppy initialization: %s" % e) cleanup(self.mount) raise -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4] KVM test: Unattended install fixes
Here we have a handful of fixes for the unattended install code. In special, with the last patch we recover the ability to run ping pong migration in the background during the unattended install for linux guests. This patchset was also sent as a pull request on gitub. Feel free to make comments there: https://github.com/autotest/autotest/pull/24 Lucas Meneghel Rodrigues (4): KVM test: unattended_install: Fix unused variable warning KVM test: unattended_install: Remove unused variable images_dir KVM test: unattended_install: Remove unused variable post_install_delay KVM test: unattended_install: Fix the background migration code client/tests/kvm/tests/unattended_install.py | 13 + 1 files changed, 1 insertions(+), 12 deletions(-) -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [PATCH 00/10] [PATCH RFC V2] Paravirtualized ticketlocks
On 09/27/2011 02:34 AM, Stephan Diestelhorst wrote: > On Wednesday 14 September 2011, 17:31:32 Jeremy Fitzhardinge wrote: >> This series replaces the existing paravirtualized spinlock mechanism >> with a paravirtualized ticketlock mechanism. > [...] >> The unlock code is very straightforward: >> prev = *lock; >> __ticket_unlock_release(lock); >> if (unlikely(__ticket_in_slowpath(lock))) >> __ticket_unlock_slowpath(lock, prev); >> >> which generates: >> push %rbp >> mov%rsp,%rbp >> >> movzwl (%rdi),%esi >> addb $0x2,(%rdi) >> movzwl (%rdi),%eax >> testb $0x1,%ah >> jne1f >> >> pop%rbp >> retq >> >> ### SLOWPATH START >> 1: movzwl (%rdi),%edx >> movzbl %dh,%ecx >> mov%edx,%eax >> and$-2,%ecx # clear TICKET_SLOWPATH_FLAG >> mov%cl,%dh >> cmp%dl,%cl # test to see if lock is uncontended >> je 3f >> >> 2: movzbl %dl,%esi >> callq *__ticket_unlock_kick# kick anyone waiting >> pop%rbp >> retq >> >> 3: lock cmpxchg %dx,(%rdi) # use cmpxchg to safely write back flag >> jmp2b >> ### SLOWPATH END > [...] >> Thoughts? Comments? Suggestions? > You have a nasty data race in your code that can cause a losing > acquirer to sleep forever, because its setting the TICKET_SLOWPATH flag > can race with the lock holder releasing the lock. > > I used the code for the slow path from the GIT repo. > > Let me try to point out an interleaving: > > Lock is held by one thread, contains 0x0200. > > _Lock holder_ _Acquirer_ > mov$0x200,%eax > lock xadd %ax,(%rdi) > // ax:= 0x0200, lock:= 0x0400 > ... > // this guy spins for a while, reading > // the lock > ... > //trying to free the lock > movzwl (%rdi),%esi (esi:=0x0400) > addb $0x2,(%rdi) (LOCAL copy of lock is now: 0x0402) > movzwl (%rdi),%eax (local forwarding from previous store: eax := 0x0402) > testb $0x1,%ah(no wakeup of anybody) > jne1f > > callq *__ticket_lock_spinning > ... > // __ticket_enter_slowpath(lock) > lock or (%rdi), $0x100 > // (global view of lock := 0x0500) > ... > ACCESS_ONCE(lock->tickets.head) == want > // (reads 0x00) > ... > xen_poll_irq(irq); // goes to sleep > ... > [addb $0x2,(%rdi)] > // (becomes globally visible only now! global view of lock := 0x0502) > ... > > Your code is reusing the (just about) safe version of unlocking a > spinlock without understanding the effect that close has on later > memory ordering. It may work on CPUs that cannot do narrow -> wide > store to load forwarding and have to make the addb store visible > globally. This is an implementation artifact of specific uarches, and > you mustn't rely on it, since our specified memory model allows looser > behaviour. Ah, thanks for this observation. I've seen this bug before when I didn't pay attention to the unlock W vs flag R ordering at all, and I was hoping the aliasing would be sufficient - and certainly this seems to have been OK on my Intel systems. But you're saying that it will fail on current AMD systems? Have you tested this, or is this just from code analysis (which I agree with after reviewing the ordering rules in the Intel manual). > Since you want to get that addb out to global memory before the second > read, either use a LOCK prefix for it, add an MFENCE between addb and > movzwl, or use a LOCKed instruction that will have a fencing effect > (e.g., to top-of-stack)between addb and movzwl. Hm. I don't really want to do any of those because it will probably have a significant effect on the unlock performance; I was really trying to avoid adding any more locked instructions. 
A previous version of the code had an mfence in here, but I hit on the idea of using aliasing to get the ordering I want - but overlooked the possible effect of store forwarding. I guess it comes down to throwing myself on the efficiency of some kind of fence instruction. I guess an lfence would be sufficient; is that any more efficient than a full mfence? At least I can make it so that its only present when pv ticket locks are actually in use, so it won't affect the native case. Could you give me a pointer to AMD's description of the ordering rules? Thanks, J -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majo
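To pin down what "add an MFENCE between addb and movzwl" would look like, below is a minimal C sketch of the unlock path with an explicit full fence between the head update and the slowpath-flag read. The struct layout, helper name and use of GCC atomic builtins are illustrative assumptions; the real code is x86 inline assembly in the patch series, and this does not claim to be the final fix.

#include <stdint.h>

#define TICKET_SLOWPATH_FLAG	0x01	/* lives in the tail byte */

struct ticketlock {
	struct {
		uint8_t head;	/* ticket currently being served */
		uint8_t tail;	/* next ticket to hand out, plus flag */
	} tickets;
};

/* would kick the vCPU waiting for ticket 'next' via the hypervisor */
extern void ticket_unlock_kick(struct ticketlock *lock, uint8_t next);

static inline void ticket_unlock(struct ticketlock *lock)
{
	/* only the lock holder ever writes head, so a plain store suffices */
	uint8_t head = __atomic_load_n(&lock->tickets.head, __ATOMIC_RELAXED);

	/* the "addb $0x2,(%rdi)" release of the lock */
	__atomic_store_n(&lock->tickets.head, head + 2, __ATOMIC_RELEASE);

	/*
	 * Full fence (mfence on x86) so the head store is globally visible
	 * before the slowpath flag is read.  Without it, store->load
	 * forwarding lets the flag read complete against the stale value,
	 * and a waiter that sets the flag after reading the old head can
	 * sleep forever; this is the interleaving described above.
	 */
	__atomic_thread_fence(__ATOMIC_SEQ_CST);

	if (__atomic_load_n(&lock->tickets.tail, __ATOMIC_RELAXED) &
	    TICKET_SLOWPATH_FLAG)
		ticket_unlock_kick(lock, head + 2);
}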
Re: [PATCH 0/4] acpi: fix up EJ0 in DSDT
On 09/27/2011 03:04 PM, Michael S. Tsirkin wrote: >There was a concern raised with > two-pass PCI initialization that I need to follow up on before > tagging. The isa bridge? I thought that got fixed ... Daniel Berrange reported a regression with virtio-9p. Paolo -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm linux-next tree
Hi Avi, On Tue, 27 Sep 2011 15:25:02 +0300 Avi Kivity wrote: > > Please add > > git://github.com/avikivity/kvm.git kvm-updates/3.2 > > to the linux-next collection, until kernel.org is back up. Thanks. I will switch to this tomorrow. -- Cheers, Stephen Rothwell s...@canb.auug.org.au http://www.canb.auug.org.au/~sfr/ pgpsSrtlYk2CI.pgp Description: PGP signature
Re: KVM call agenda for September 26
Juan Quintela wrote: > Hi > > Please send in any agenda items you are interested in covering. > > Thanks, Juan. As there are no agenda items for today, the call is cancelled. Happy hacking, Juan. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 4:12 PM, Roedel, Joerg wrote: > You pass a pointer to an unsigned long for the page-size bitmap. This > allows to use an array of unsigned long. But a single unsigned long is > sufficient This is fine; I can change that if you like it better (though without doing the change below this is probably moot). > and you can use functions like ffs() and fls() together with > shifting. These functions often translate to a single intruction in the > binary. The find_next_bit function has much more overhead because it > needs to handle the array-of-ulong case. So you're suggesting to re-implement find_next_bit() using ffs()/fls() and shifting ? What's the point ? Sure, if we'll have a proven performance issue while using find_next_bit() we can think of doing this, but at this stage, this sounds to me like a premature optimization which isn't too elegant. At this point I strongly prefer readable, maintainable and easier to debug code over optimization which isn't proven to be necessary. Thanks, Ohad. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 08:26:29AM -0400, Ohad Ben-Cohen wrote: > > With an unsigned long you can use plain and fast bit_ops instead of the > > full bitmap functions. > > Not sure I follow; the only bit operation I'm using while mapping is > find_next_bit() (which is a bitops.h method). > > What other faster variant are you referring to ? You pass a pointer to an unsigned long for the page-size bitmap. This allows to use an array of unsigned long. But a single unsigned long is sufficient and you can use functions like ffs() and fls() together with shifting. These functions often translate to a single intruction in the binary. The find_next_bit function has much more overhead because it needs to handle the array-of-ulong case. Joerg -- AMD Operating System Research Center Advanced Micro Devices GmbH Einsteinring 24 85609 Dornach General Managers: Alberto Bozzo, Andrew Bowd Registration: Dornach, Landkr. Muenchen; Registerger. Muenchen, HRB Nr. 43632 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
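As a concrete illustration of the single-unsigned-long point, here is a small user-space helper that picks the largest usable page size with a mask and one fls()-style operation, instead of walking bits with find_next_bit(). The compiler builtins stand in for the kernel bit helpers; this is a sketch of the idea, not kernel code.

#include <stddef.h>

/*
 * pgsize_bitmap: bit i set => page size (1UL << i) supported.
 * addr: iova | paddr of the current chunk; size: bytes left (must be non-zero).
 * Returns the largest supported page size usable here, or 0 if the request
 * is not even aligned to the smallest supported page.
 */
static unsigned long pick_pgsize(unsigned long pgsize_bitmap,
				 unsigned long addr, size_t size)
{
	unsigned long long_bits = 8 * sizeof(long);
	unsigned long align_idx = addr ? __builtin_ctzl(addr) : long_bits - 1;
	unsigned long size_idx = long_bits - 1 - __builtin_clzl((unsigned long)size);
	unsigned long max_idx = align_idx < size_idx ? align_idx : size_idx;
	unsigned long candidates;

	/* keep only the page sizes that fit and are aligned, then take the biggest */
	candidates = pgsize_bitmap & ((2UL << max_idx) - 1);
	if (!candidates)
		return 0;

	return 1UL << (long_bits - 1 - __builtin_clzl(candidates));
}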
[PATCH v2] kvm tools: Set active console before running /bin/sh
This patch sets active console to the serial console before running the shell. Doing so fixes two issues: * Fix job control. * Set all env vars. The user-visible results are fewer warnings (no more of this: sh: cannot set terminal process group (-1): Inappropriate ioctl for device sh: no job control in this shell) A working 'top', and a working ctrl-c. Signed-off-by: Sasha Levin --- tools/kvm/guest/init.c |8 +++- 1 files changed, 7 insertions(+), 1 deletions(-) diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c index 837acfb..4fc6aaf 100644 --- a/tools/kvm/guest/init.c +++ b/tools/kvm/guest/init.c @@ -11,7 +11,7 @@ static int run_process(char *filename) { char *new_argv[] = { filename, NULL }; - char *new_env[] = { NULL }; + char *new_env[] = { "TERM=linux", NULL }; return execve(filename, new_argv, new_env); } @@ -30,6 +30,12 @@ int main(int argc, char *argv[]) do_mounts(); +/* become session leader */ +setsid(); + +/* set controlling terminal */ +ioctl(0, TIOCSCTTY, 1); + puts("Starting '/bin/sh'..."); run_process("/bin/sh"); -- 1.7.6.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/4] acpi: fix up EJ0 in DSDT
On Mon, Sep 26, 2011 at 08:04:24PM -0400, Kevin O'Connor wrote: > On Mon, Sep 26, 2011 at 10:04:13AM +0300, Michael S. Tsirkin wrote: > > On Mon, Sep 26, 2011 at 12:40:18AM -0400, Kevin O'Connor wrote: > > > On Thu, Sep 22, 2011 at 09:09:49AM +0300, Michael S. Tsirkin wrote: > > > > On Thu, Sep 22, 2011 at 12:35:13AM -0400, Kevin O'Connor wrote: > > > > > The code to generate basic SSDT code isn't that difficult (see > > > > > build_ssdt and src/ssdt-proc.dsl). Is there a compelling reason to > > > > > patch the DSDT versus just generating the necessary blocks in an SSDT? > > > > > > > > I don't really care whether the code is in DSDT or SSDT, > > > > IMO there isn't much difference between build_ssdt and patching: > > > > main reason is build_ssdt uses offsets hardcoded to a specific binary > > > > (ssdt_proc and SD_OFFSET_* ) while I used > > > > a script to extract offsets. > > > > > > Yes - your script to extract the offsets is nice. > > > > If you still have doubts, > > it might make sense to merge just patch 1 - > > acpi: generate and parse mixed asl/aml listing > > - as the first step. > > With the infrastructure in place it will be > > easier to discuss the best way to use it. > > I'm okay with your first patch. BTW, any more comments with the rest of the patchset? If you just need to think about it, I understand. > However, I wish to tag a release > before committing ACPI changes. Sure. So you'll take this patchset from here or want me to ping you later? > There was a concern raised with > two-pass PCI initialization that I need to follow up on before > tagging. The isa bridge? I thought that got fixed ... > > -Kevin -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm tools: Set active console before running /bin/sh
This patch sets active console to the serial console before running the shell. Doing so fixes two issues: * Fix job control. * Set all env vars. The user visible issues are less warnings (no more of this: sh: cannot set terminal process group (-1): Inappropriate ioctl for device sh: no job control in this shell) A working 'top', and a working ctrl-c. Signed-off-by: Sasha Levin --- tools/kvm/guest/init.c |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/tools/kvm/guest/init.c b/tools/kvm/guest/init.c index 837acfb..b55cd30 100644 --- a/tools/kvm/guest/init.c +++ b/tools/kvm/guest/init.c @@ -30,6 +30,12 @@ int main(int argc, char *argv[]) do_mounts(); +/* get session leader */ +setsid(); + +/* set controlling terminal */ +ioctl (0, TIOCSCTTY, 1); + puts("Starting '/bin/sh'..."); run_process("/bin/sh"); -- 1.7.6.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] virtio-net: Verify page list size before fitting into skb
On Tue, Sep 27, 2011 at 02:20:29PM +0300, Sasha Levin wrote: > On Tue, 2011-09-27 at 10:00 +0300, Michael S. Tsirkin wrote: > > > skb = page_to_skb(vi, page, len); > > > ... > > > > Sorry I don't get it yet. Where is mergeable ignored here? > > The NULL deref happens in page_to_skb(), before merge buffers are > handled. The len here is a single buffer length, so for mergeable buffers it is <= the size of the buffer we gave to hardware, which is PAGE_SIZE. err = virtqueue_add_buf_single(vi->rvq, page_address(page), PAGE_SIZE, false, page, gfp); Unless of course we are trying to work around broken hardware again, which I don't have a problem with, but should probably get appropriate comments in code and trigger a warning. > I'll test it and see if it's really the case. > > -- > > Sasha. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] virtio-net: Verify page list size before fitting into skb
On Tue, 2011-09-27 at 10:00 +0300, Michael S. Tsirkin wrote: > > skb = page_to_skb(vi, page, len); > > ... > > Sorry I don't get it yet. Where is mergeable ignored here? The NULL deref happens in page_to_skb(), before merge buffers are handled. I'll test it and see if it's really the case. -- Sasha. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Tue, Sep 27, 2011 at 1:05 PM, Roedel, Joerg wrote: > On Fri, Sep 16, 2011 at 01:51:41PM -0400, Ohad Ben-Cohen wrote: >> int iommu_map(struct iommu_domain *domain, unsigned long iova, >> - phys_addr_t paddr, int gfp_order, int prot) >> + phys_addr_t paddr, size_t size, int prot) >> { >> - size_t size; >> + int ret = 0; >> + >> + /* >> + * both the virtual address and the physical one, as well as >> + * the size of the mapping, must be aligned (at least) to the >> + * size of the smallest page supported by the hardware >> + */ >> + if (!IS_ALIGNED(iova | paddr | size, iommu_min_pagesz)) { >> + pr_err("unaligned: iova 0x%lx pa 0x%lx size 0x%lx min_pagesz >> " >> + "0x%x\n", iova, (unsigned long)paddr, >> + (unsigned long)size, iommu_min_pagesz); >> + return -EINVAL; >> + } >> + >> + pr_debug("map: iova 0x%lx pa 0x%lx size 0x%lx\n", iova, >> + (unsigned long)paddr, (unsigned long)size); >> + >> + while (size) { >> + unsigned long pgsize = iommu_min_pagesz; >> + unsigned long idx = iommu_min_page_idx; >> + unsigned long addr_merge = iova | paddr; >> + int order; >> + >> + /* find the max page size with which iova, paddr are aligned >> */ >> + for (;;) { >> + unsigned long try_pgsize; >> + >> + idx = find_next_bit(iommu_pgsize_bitmap, >> + iommu_nr_page_bits, idx + 1); >> + >> + /* no more pages to check ? */ >> + if (idx >= iommu_nr_page_bits) >> + break; >> + >> + try_pgsize = 1 << idx; >> >> - size = 0x1000UL << gfp_order; >> + /* page too big ? addresses not aligned ? */ >> + if (size < try_pgsize || >> + !IS_ALIGNED(addr_merge, try_pgsize)) >> + break; >> >> - BUG_ON(!IS_ALIGNED(iova | paddr, size)); >> + pgsize = try_pgsize; >> + } > > With an unsigned long you can use plain and fast bit_ops instead of the > full bitmap functions. Not sure I follow; the only bit operation I'm using while mapping is find_next_bit() (which is a bitops.h method). What other faster variant are you referring to ? Thanks, Ohad. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
kvm linux-next tree
Please add git://github.com/avikivity/kvm.git kvm-updates/3.2 to the linux-next collection, until kernel.org is back up. Thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/6] iommu/core: split mapping to page sizes as supported by the hardware
On Fri, Sep 16, 2011 at 01:51:41PM -0400, Ohad Ben-Cohen wrote: > drivers/iommu/iommu.c | 158 > +--- > drivers/iommu/omap-iovmm.c | 12 +--- > include/linux/iommu.h |6 +- > virt/kvm/iommu.c |4 +- > 4 files changed, 157 insertions(+), 23 deletions(-) > > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c > index c68ff29..7c01c8c 100644 > --- a/drivers/iommu/iommu.c > +++ b/drivers/iommu/iommu.c > @@ -16,6 +16,8 @@ > * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA > */ > > +#define pr_fmt(fmt)"%s: " fmt, __func__ > + > #include > #include > #include > @@ -23,15 +25,73 @@ > #include > #include > #include > +#include > > static struct iommu_ops *iommu_ops; > > +/* bitmap of supported page sizes */ > +static unsigned long *iommu_pgsize_bitmap; > + > +/* number of bits used to represent the supported pages */ > +static unsigned int iommu_nr_page_bits; > + > +/* size of the smallest supported page (in bytes) */ > +static unsigned int iommu_min_pagesz; > + > +/* bit number of the smallest supported page */ > +static unsigned int iommu_min_page_idx; > + > +/** > + * register_iommu() - register an IOMMU hardware > + * @ops: iommu handlers > + * @pgsize_bitmap: bitmap of page sizes supported by the hardware > + * @nr_page_bits: size of @pgsize_bitmap (in bits) > + * > + * Note: this is a temporary function, which will be removed once > + * all IOMMU drivers are converted. The only reason it exists is to > + * allow splitting the pgsizes changes to several patches in order to ease > + * the review. > + */ > +void register_iommu_pgsize(struct iommu_ops *ops, unsigned long > *pgsize_bitmap, > + unsigned int nr_page_bits) I think a plain unsigned long value is sufficient to encode the supported page-sizes. No need to use a full bitmap. > int iommu_map(struct iommu_domain *domain, unsigned long iova, > - phys_addr_t paddr, int gfp_order, int prot) > + phys_addr_t paddr, size_t size, int prot) > { > - size_t size; > + int ret = 0; > + > + /* > +* both the virtual address and the physical one, as well as > +* the size of the mapping, must be aligned (at least) to the > +* size of the smallest page supported by the hardware > +*/ > + if (!IS_ALIGNED(iova | paddr | size, iommu_min_pagesz)) { > + pr_err("unaligned: iova 0x%lx pa 0x%lx size 0x%lx min_pagesz " > + "0x%x\n", iova, (unsigned long)paddr, > + (unsigned long)size, iommu_min_pagesz); > + return -EINVAL; > + } > + > + pr_debug("map: iova 0x%lx pa 0x%lx size 0x%lx\n", iova, > + (unsigned long)paddr, (unsigned long)size); > + > + while (size) { > + unsigned long pgsize = iommu_min_pagesz; > + unsigned long idx = iommu_min_page_idx; > + unsigned long addr_merge = iova | paddr; > + int order; > + > + /* find the max page size with which iova, paddr are aligned > */ > + for (;;) { > + unsigned long try_pgsize; > + > + idx = find_next_bit(iommu_pgsize_bitmap, > + iommu_nr_page_bits, idx + 1); > + > + /* no more pages to check ? */ > + if (idx >= iommu_nr_page_bits) > + break; > + > + try_pgsize = 1 << idx; > > - size = 0x1000UL << gfp_order; > + /* page too big ? addresses not aligned ? */ > + if (size < try_pgsize || > + !IS_ALIGNED(addr_merge, try_pgsize)) > + break; > > - BUG_ON(!IS_ALIGNED(iova | paddr, size)); > + pgsize = try_pgsize; > + } With an unsigned long you can use plain and fast bit_ops instead of the full bitmap functions. 
> > - return iommu_ops->map(domain, iova, paddr, gfp_order, prot); > + order = get_order(pgsize); > + > + pr_debug("mapping: iova 0x%lx pa 0x%lx order %d\n", iova, > + (unsigned long)paddr, order); > + > + ret = iommu_ops->map(domain, iova, paddr, order, prot); > + if (ret) > + break; > + > + size -= pgsize; > + iova += pgsize; > + paddr += pgsize; > + } > + > + return ret; > } > EXPORT_SYMBOL_GPL(iommu_map); > -- AMD Operating System Research Center Advanced Micro Devices GmbH Einsteinring 24 85609 Dornach General Managers: Alberto Bozzo, Andrew Bowd Registration: Dornach, Landkr. Muenchen; Registerger. Muenchen, HRB Nr. 43632 -- To unsubscribe from t
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On 09/27/2011 12:00 PM, Robin Lee Powell wrote: On Tue, Sep 27, 2011 at 01:48:43AM -0700, Robin Lee Powell wrote: > On Tue, Sep 27, 2011 at 04:41:33PM +0800, Emmanuel Noobadmin wrote: > > On 9/27/11, Robin Lee Powell wrote: > > > On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin > > > wrote: > > >> It's unrelated to what you're actually using as the disks, > > >> whether file or block devices like LVs. I think it just makes > > >> KVM tell the host not to cache I/O done on the storage device. > > > > > > Wait, hold on, I think I had it backwards. > > > > > > It tells the *host* to not cache the device in question, or the > > > *VMs* to not cache the device in question? > > > > I'm fairly certain it tells the qemu not to cache the device in > > question. If you don't want the guest to cache their i/o, then the > > guest OS should be configured if it allows that. Although I'm not > > sure if it's possible to disable disk buffering/caching system > > wide in Linux. > > OK, great, thanks. > > Now if I could just figure out how to stop the host from swapping > out much of the VMs' qemu-kvm procs when it has almost a GiB of RAM > left. -_- swappiness 0 doesn't seem to help there. Grrr. I turned swap off to clear it. A few minutes ago, this host was at zero swap: top - 01:59:10 up 10 days, 15:17, 3 users, load average: 6.39, 4.26, 3.24 Tasks: 151 total, 1 running, 150 sleeping, 0 stopped, 0 zombie Cpu(s): 6.6%us, 1.0%sy, 0.0%ni, 85.9%id, 6.3%wa, 0.0%hi, 0.2%si, 0.0%st Mem: 8128772k total, 656k used, 1617656k free,14800k buffers Swap: 8388604k total, 672828k used, 7715776k free,97536k cached PID USER PR NI VIRT RES SHR S %CPU %MEMTIME+ COMMAND 2504 qemu 20 0 2425m 1.8g 448 S 10.0 23.4 3547:59 qemu-kvm 2258 qemu 20 0 2425m 1.7g 444 S 2.7 21.7 1288:15 qemu-kvm 18061 qemu 20 0 2433m 1.8g 428 S 2.3 23.4 401:01.99 qemu-kvm 10335 qemu 20 0 1864m 861m 456 S 1.0 10.9 2:04.26 qemu-kvm [snip] Why is it doing this?!? ;'( Please post the contents of /proc/meminfo and /proc/zoneinfo when this is happening. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [PATCH 00/10] [PATCH RFC V2] Paravirtualized ticketlocks
On Wednesday 14 September 2011, 17:31:32 Jeremy Fitzhardinge wrote: > This series replaces the existing paravirtualized spinlock mechanism > with a paravirtualized ticketlock mechanism. [...] > The unlock code is very straightforward: > prev = *lock; > __ticket_unlock_release(lock); > if (unlikely(__ticket_in_slowpath(lock))) > __ticket_unlock_slowpath(lock, prev); > > which generates: > push %rbp > mov%rsp,%rbp > > movzwl (%rdi),%esi > addb $0x2,(%rdi) > movzwl (%rdi),%eax > testb $0x1,%ah > jne1f > > pop%rbp > retq > > ### SLOWPATH START > 1:movzwl (%rdi),%edx > movzbl %dh,%ecx > mov%edx,%eax > and$-2,%ecx # clear TICKET_SLOWPATH_FLAG > mov%cl,%dh > cmp%dl,%cl # test to see if lock is uncontended > je 3f > > 2:movzbl %dl,%esi > callq *__ticket_unlock_kick# kick anyone waiting > pop%rbp > retq > > 3:lock cmpxchg %dx,(%rdi) # use cmpxchg to safely write back flag > jmp2b > ### SLOWPATH END [...] > Thoughts? Comments? Suggestions? You have a nasty data race in your code that can cause a losing acquirer to sleep forever, because its setting the TICKET_SLOWPATH flag can race with the lock holder releasing the lock. I used the code for the slow path from the GIT repo. Let me try to point out an interleaving: Lock is held by one thread, contains 0x0200. _Lock holder_ _Acquirer_ mov$0x200,%eax lock xadd %ax,(%rdi) // ax:= 0x0200, lock:= 0x0400 ... // this guy spins for a while, reading // the lock ... //trying to free the lock movzwl (%rdi),%esi (esi:=0x0400) addb $0x2,(%rdi) (LOCAL copy of lock is now: 0x0402) movzwl (%rdi),%eax (local forwarding from previous store: eax := 0x0402) testb $0x1,%ah(no wakeup of anybody) jne1f callq *__ticket_lock_spinning ... // __ticket_enter_slowpath(lock) lock or (%rdi), $0x100 // (global view of lock := 0x0500) ... ACCESS_ONCE(lock->tickets.head) == want // (reads 0x00) ... xen_poll_irq(irq); // goes to sleep ... [addb $0x2,(%rdi)] // (becomes globally visible only now! global view of lock := 0x0502) ... Your code is reusing the (just about) safe version of unlocking a spinlock without understanding the effect that close has on later memory ordering. It may work on CPUs that cannot do narrow -> wide store to load forwarding and have to make the addb store visible globally. This is an implementation artifact of specific uarches, and you mustn't rely on it, since our specified memory model allows looser behaviour. Since you want to get that addb out to global memory before the second read, either use a LOCK prefix for it, add an MFENCE between addb and movzwl, or use a LOCKed instruction that will have a fencing effect (e.g., to top-of-stack)between addb and movzwl. Stephan -- Stephan Diestelhorst, AMD Operating System Research Center stephan.diestelho...@amd.com Tel. +49 (0)351 448 356 719 Advanced Micro Devices GmbH Einsteinring 24 85609 Aschheim Germany Geschaeftsfuehrer: Alberto Bozzo Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen Registergericht Muenchen, HRB Nr. 43632, WEEE-Reg-Nr: DE 12919551 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: LPC2011 Virtualization Micro Conf
Hi,

For those who are interested, I have posted the notes from the 2011
Linux Plumbers Virtualization micro conference here:

http://wiki.linuxplumbersconf.org/2011:virtualization

Slides can be found by clicking on the presentation and going onto the
Plumbers abstracts.

Cheers,
Jes
Re: [PATCH] qemu-kvm: Switch POSIX compat AIO implementation to upstream
On 09/26/2011 09:09 PM, Anthony Liguori wrote:
On 09/26/2011 12:24 PM, Avi Kivity wrote:
On 09/26/2011 08:23 PM, Jan Kiszka wrote:
>
> Perhaps qemu_eventfd() can be used in the future instead of an explicit
> pipe. Then Linux will do eventfd while other OSes will fall back to
> pipes.

Basically simpler code, or does this also have runtime benefits?

In corner cases, the completion can block on the write() with pipes, but
not eventfd.

The pipe is O_NONBLOCK and the only thing the caller cares about is
whether there is *some* data in the PIPE so it can happily ignore EAGAIN.
So the pipe implementation never blocks on write() either.

It may require multiple reads to drain the queue though whereas eventfd
would only require a single read to drain the queue.

Oh, so switching to eventfd won't change much. Still nice though IMO.

--
error compiling committee.c: too many arguments to function
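For readers following along, here is a small self-contained sketch of the two
notification styles being compared (this is not qemu code; qemu_eventfd() and
the AIO completion path are only referenced above, and the function names
below are made up for illustration):

	#include <errno.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	/*
	 * Pipe-style wakeup: both ends are O_NONBLOCK.  A full pipe makes
	 * write() fail with EAGAIN, which is fine because the reader only
	 * cares that *some* data is pending.
	 */
	static void notify_pipe(int wfd)
	{
		char c = 0;
		ssize_t r;

		do {
			r = write(wfd, &c, 1);
		} while (r < 0 && errno == EINTR);
		/* EAGAIN deliberately ignored: the pipe already has data. */
	}

	static void drain_pipe(int rfd)
	{
		char buf[512];

		/* May take several reads before the pipe is empty. */
		while (read(rfd, buf, sizeof(buf)) > 0)
			;
	}

	/*
	 * eventfd-style wakeup: the kernel keeps a 64-bit counter, so a
	 * single read always drains every pending notification at once.
	 */
	static void notify_eventfd(int efd)
	{
		uint64_t one = 1;

		(void)write(efd, &one, sizeof(one));
	}

	static void drain_eventfd(int efd)
	{
		uint64_t count;

		(void)read(efd, &count, sizeof(count));
	}

The pipe pair would come from pipe() plus fcntl(F_SETFL, O_NONBLOCK) on both
ends, the eventfd from eventfd(0, EFD_NONBLOCK).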
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On Tue, Sep 27, 2011 at 01:48:43AM -0700, Robin Lee Powell wrote:
> On Tue, Sep 27, 2011 at 04:41:33PM +0800, Emmanuel Noobadmin wrote:
> > On 9/27/11, Robin Lee Powell wrote:
> > > On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin
> > > wrote:
> > >> It's unrelated to what you're actually using as the disks,
> > >> whether file or block devices like LVs. I think it just makes
> > >> KVM tell the host not to cache I/O done on the storage device.
> > >
> > > Wait, hold on, I think I had it backwards.
> > >
> > > It tells the *host* to not cache the device in question, or the
> > > *VMs* to not cache the device in question?
> >
> > I'm fairly certain it tells the qemu not to cache the device in
> > question. If you don't want the guest to cache their i/o, then the
> > guest OS should be configured if it allows that. Although I'm not
> > sure if it's possible to disable disk buffering/caching system
> > wide in Linux.
>
> OK, great, thanks.
>
> Now if I could just figure out how to stop the host from swapping
> out much of the VMs' qemu-kvm procs when it has almost a GiB of RAM
> left. -_- swappiness 0 doesn't seem to help there.

Grrr. I turned swap off to clear it. A few minutes ago, this host was
at zero swap:

top - 01:59:10 up 10 days, 15:17, 3 users, load average: 6.39, 4.26, 3.24
Tasks: 151 total,   1 running, 150 sleeping,   0 stopped,   0 zombie
Cpu(s): 6.6%us, 1.0%sy, 0.0%ni, 85.9%id, 6.3%wa, 0.0%hi, 0.2%si, 0.0%st
Mem:   8128772k total,      656k used,  1617656k free,    14800k buffers
Swap:  8388604k total,   672828k used,  7715776k free,    97536k cached

  PID USER  PR  NI  VIRT  RES  SHR S %CPU %MEM     TIME+  COMMAND
 2504 qemu  20   0 2425m 1.8g  448 S 10.0 23.4   3547:59  qemu-kvm
 2258 qemu  20   0 2425m 1.7g  444 S  2.7 21.7   1288:15  qemu-kvm
18061 qemu  20   0 2433m 1.8g  428 S  2.3 23.4  401:01.99 qemu-kvm
10335 qemu  20   0 1864m 861m  456 S  1.0 10.9    2:04.26 qemu-kvm

[snip]

Why is it doing this?!? ;'(

(I don't know if anyone really has an answer, just wanted to rant)

-Robin
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On Tue, Sep 27, 2011 at 04:41:33PM +0800, Emmanuel Noobadmin wrote:
> On 9/27/11, Robin Lee Powell wrote:
> > On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin
> > wrote:
> >> It's unrelated to what you're actually using as the disks,
> >> whether file or block devices like LVs. I think it just makes
> >> KVM tell the host not to cache I/O done on the storage device.
> >
> > Wait, hold on, I think I had it backwards.
> >
> > It tells the *host* to not cache the device in question, or the
> > *VMs* to not cache the device in question?
>
> I'm fairly certain it tells the qemu not to cache the device in
> question. If you don't want the guest to cache their i/o, then the
> guest OS should be configured if it allows that. Although I'm not
> sure if it's possible to disable disk buffering/caching system
> wide in Linux.

OK, great, thanks.

Now if I could just figure out how to stop the host from swapping
out much of the VMs' qemu-kvm procs when it has almost a GiB of RAM
left. -_- swappiness 0 doesn't seem to help there.

-Robin
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On 9/27/11, Robin Lee Powell wrote:
> On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin wrote:
>> It's unrelated to what you're actually using as the disks, whether
>> file or block devices like LVs. I think it just makes KVM tell the
>> host not to cache I/O done on the storage device.
>
> Wait, hold on, I think I had it backwards.
>
> It tells the *host* to not cache the device in question, or the
> *VMs* to not cache the device in question?

I'm fairly certain it tells the qemu not to cache the device in
question. If you don't want the guest to cache their i/o, then the
guest OS should be configured if it allows that. Although I'm not
sure if it's possible to disable disk buffering/caching system
wide in Linux.
Re: [kvm] Re: [kvm] Re: Questions about duplicate memory work
On Mon, Sep 26, 2011 at 04:15:37PM +0800, Emmanuel Noobadmin wrote:
> On 9/26/11, Robin Lee Powell wrote:
> > On Mon, Sep 26, 2011 at 01:49:18PM +0800, Emmanuel Noobadmin wrote:
> >> On 9/25/11, Robin Lee Powell wrote:
> >> >
> >> > OK, so I've got a Linux host, and a bunch of Linux VMs.
> >> >
> >> > This means that the host *and* all tho VMs do their own disk
> >> > caches/buffers and do their own swap as well.
> >>
> >> If I'm not wrong, that's why the recommended and current default
> >> in libvirtd is to create storage devices with no caching to remove
> >> one layer of duplication.
> >
> > How do you do that? I have my VMs using LVs created on the host as
> > their disks, but I'm open to other methods if there are significant
> > advantages.
>
> It's unrelated to what you're actually using as the disks, whether
> file or block devices like LVs. I think it just makes KVM tell the
> host not to cache I/O done on the storage device.

Wait, hold on, I think I had it backwards.

It tells the *host* to not cache the device in question, or the
*VMs* to not cache the device in question?

-Robin
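For anyone who wants the mechanics: the cache mode is a per-disk setting, so
with a raw LV it would look roughly like this (the device path and target
names here are just placeholders). On the qemu-kvm command line:

	-drive file=/dev/vg0/vm1-disk,if=virtio,format=raw,cache=none

and the libvirt equivalent is the cache attribute on the disk's <driver>
element in the domain XML:

	<disk type='block' device='disk'>
	  <driver name='qemu' type='raw' cache='none'/>
	  <source dev='/dev/vg0/vm1-disk'/>
	  <target dev='vda' bus='virtio'/>
	</disk>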
Re: [PATCH v3] kvm tools: Support multiple net devices
* Sasha Levin wrote:

> --- a/tools/kvm/builtin-run.c
> +++ b/tools/kvm/builtin-run.c
> @@ -87,9 +87,12 @@ static bool sdl;
>  static bool balloon;
>  static bool using_rootfs;
>  static bool custom_rootfs;
> +static bool no_net;
>  extern bool ioport_debug;
>  extern int active_console;
>  extern int debug_iodelay;
> +struct virtio_net_parameters *net_params;
> +int num_net_devices;

Just a stylistic nit-pick: this variable definition section looks pretty
ugly. Those externs should be in a header and in any case different types
of lines should generally not be mixed without at least a newline between
them.

> +static int set_net_param(struct virtio_net_parameters *p, const char *param,
> +			 const char *val)

Naming nit: 'struct virtio_net_params' is shorter by 4 chars and just as
expressive.

> +	char *buf, *cmd = NULL, *cur = NULL;
> +	bool on_cmd = true;
> +
> +	if (arg) {
> +		buf = strdup(arg);
> +		if (buf == NULL)
> +			die("Failed allocating new net buffer");
> +		cur = strtok(buf, ",=");
> +	}
> +
> +	p = (struct virtio_net_parameters) {
> +		.guest_ip	= DEFAULT_GUEST_ADDR,
> +		.host_ip	= DEFAULT_HOST_ADDR,
> +		.script		= DEFAULT_SCRIPT,
> +		.mode		= NET_MODE_TAP,
> +	};
> +
> +	str_to_mac(DEFAULT_GUEST_MAC, p.guest_mac);
> +	p.guest_mac[5] += num_net_devices;
> +
> +	while (cur) {
> +		if (on_cmd) {
> +			cmd = cur;
> +		} else {
> +			if (set_net_param(&p, cmd, cur) < 0)
> +				goto done;
> +		}
> +		on_cmd = !on_cmd;
> +
> +		cur = strtok(NULL, ",=");
> +	};
> +
> +	num_net_devices++;
> +
> +	net_params = realloc(net_params, num_net_devices * sizeof(*net_params));
> +	if (net_params == NULL)
> +		die("Failed adding new network device");
> +
> +	net_params[num_net_devices - 1] = p;
> +
> +done:
> +	return 0;
> +}

Isn't 'buf' leaked here?

Patch looks good otherwise.

Thanks,

	Ingo
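A minimal sketch of the fix for the leak Ingo points out, keeping the v3
variable names: initialize buf to NULL and release it on the shared exit path,
so both the early "goto done" and the normal return free it (free(NULL) is a
no-op, which also covers the case where no argument was given):

	char *buf = NULL, *cmd = NULL, *cur = NULL;
	...
	done:
		free(buf);
		return 0;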
Re: [PATCH 1/2] virtio-net: Verify page list size before fitting into skb
On Tue, Sep 27, 2011 at 09:44:02AM +0300, Sasha Levin wrote:
> On Mon, 2011-09-26 at 22:55 +0300, Michael S. Tsirkin wrote:
> > On Mon, Sep 26, 2011 at 10:37:22PM +0300, Sasha Levin wrote:
> > > On Mon, 2011-09-26 at 21:44 +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Sep 26, 2011 at 08:41:08PM +0300, Sasha Levin wrote:
> > > > > This patch verifies that the length of a buffer stored in a linked list
> > > > > of pages is small enough to fit into a skb.
> > > > >
> > > > > If the size is larger than a max size of a skb, it means that we shouldn't
> > > > > go ahead building skbs anyway since we won't be able to send the buffer as
> > > > > the user requested.
> > > > >
> > > > > Cc: Rusty Russell
> > > > > Cc: "Michael S. Tsirkin"
> > > > > Cc: virtualizat...@lists.linux-foundation.org
> > > > > Cc: net...@vger.kernel.org
> > > > > Cc: kvm@vger.kernel.org
> > > > > Signed-off-by: Sasha Levin
> > > >
> > > > Interesting. This is a theoretical issue, correct?
> > > > Not a crash you actually see.
> > >
> > > Actually it was an actual crash caused when our virtio-net driver in kvm
> > > tools did funny things and passed '(u32)-1' length as a buffer length to
> > > the guest kernel.
> > >
> > > > This crash would mean device is giving us packets
> > > > that are way too large. Avoiding crashes even in the face of
> > > > a misbehaved device is a good idea, but should
> > > > we print a diagnostic to a system log?
> > > > Maybe rate-limited or print once to avoid filling
> > > > up the disk. Other places in driver print with pr_debug
> > > > I'm not sure that's right but better than nothing.
> > >
> > > Yup, I'll add some debug info.
> > >
> > > > > ---
> > > > >  drivers/net/virtio_net.c |    3 +++
> > > > >  1 files changed, 3 insertions(+), 0 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index 0c7321c..64e0717 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -165,6 +165,9 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >  	unsigned int copy, hdr_len, offset;
> > > > >  	char *p;
> > > > >
> > > > > +	if (len > MAX_SKB_FRAGS * PAGE_SIZE)
> > > >
> > > > unlikely()?
> > > >
> > > > Also, this seems too aggressive: at this point len includes the header
> > > > and the linear part. The right place for this
> > > > test is probably where we fill in the frags, just before
> > > > 	while (len)
> > > >
> > > > The whole can only happen when mergeable buffers
> > > > are disabled, right?
> > >
> > > From what I understand it can happen whenever you're going to build a
> > > skb longer than PAGE_SIZE.
> >
> > Hmm how exactly? With mergeable buffers this only gets
> > the length of the 1st chunk which is up to 4K unless the driver
> > is buggy ...
>
> What about the case where TSO or ECN features are set? The code flow
> suggests that mergeable would get ignored in that case:
>
> 	/* If we can receive ANY GSO packets, we must allocate large ones. */
> 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> 	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
> 	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
> 		vi->big_packets = true;
>
> [...]
>
> 	if (!vi->mergeable_rx_bufs && !vi->big_packets) {
> 		skb = buf;
> 		len -= sizeof(struct virtio_net_hdr);
> 		skb_trim(skb, len);
> 	} else {
> 		page = buf;
> 		skb = page_to_skb(vi, page, len);
> 	...

Sorry I don't get it yet. Where is mergeable ignored here?
> I haven't actually tested it with mergeable buffers enabled, but could
> do it later today.

Please do.

> --
>
> Sasha.
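Putting Michael's three comments together (wrap the test in unlikely(), print
a rate-limited pr_debug diagnostic, and move the check to just before the
frag-filling "while (len)" loop in page_to_skb()), the check would land
roughly like this; this is a sketch of the direction discussed, not
necessarily the final patch:

	/*
	 * Just before the while (len) loop that fills in the frags: a
	 * device handing us a buffer that cannot fit in MAX_SKB_FRAGS
	 * pages is broken, so log (rate-limited) and drop instead of
	 * overrunning the skb.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		if (net_ratelimit())
			pr_debug("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}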