[libvirt] [PATCH] Add nwfilter support to UML driver
Extend user-mode-linux driver to support nwfilter. Signed-off-by: Soren Hansen so...@linux2go.dk --- src/uml/uml_conf.c | 16 +--- src/uml/uml_driver.c |8 +++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/uml/uml_conf.c b/src/uml/uml_conf.c index 4906192..f2eaef5 100644 --- a/src/uml/uml_conf.c +++ b/src/uml/uml_conf.c @@ -46,6 +46,7 @@ #include verify.h #include bridge.h #include logging.h +#include domain_nwfilter.h #define VIR_FROM_THIS VIR_FROM_UML @@ -108,7 +109,8 @@ virCapsPtr umlCapsInit(void) { static int -umlConnectTapDevice(virDomainNetDefPtr net, +umlConnectTapDevice(virConnectPtr conn, +virDomainNetDefPtr net, const char *bridge) { brControl *brctl = NULL; @@ -164,6 +166,14 @@ umlConnectTapDevice(virDomainNetDefPtr net, goto error; } +if (net-filter) { +if (virDomainConfNWFilterInstantiate(conn, net)) { +if (template_ifname) +VIR_FREE(net-ifname); +goto error; +} +} + brShutdown(brctl); return 0; @@ -239,7 +249,7 @@ umlBuildCommandLineNet(virConnectPtr conn, goto error; } -if (umlConnectTapDevice(def, bridge) 0) { +if (umlConnectTapDevice(conn, def, bridge) 0) { VIR_FREE(bridge); goto error; } @@ -250,7 +260,7 @@ umlBuildCommandLineNet(virConnectPtr conn, } case VIR_DOMAIN_NET_TYPE_BRIDGE: -if (umlConnectTapDevice(def, def-data.bridge.brname) 0) +if (umlConnectTapDevice(conn, def, def-data.bridge.brname) 0) goto error; /* ethNNN=tuntap,tapname,macaddr,gateway */ diff --git a/src/uml/uml_driver.c b/src/uml/uml_driver.c index 0a5c829..40345d5 100644 --- a/src/uml/uml_driver.c +++ b/src/uml/uml_driver.c @@ -58,6 +58,7 @@ #include domain_conf.h #include datatypes.h #include logging.h +#include domain_nwfilter.h #define VIR_FROM_THIS VIR_FROM_UML @@ -876,6 +877,7 @@ static int umlStartVMDaemon(virConnectPtr conn, if (umlBuildCommandLine(conn, driver, vm, keepfd, argv, progenv) 0) { close(logfd); +virDomainConfVMNWFilterTeardown(vm); umlCleanupTapDevices(conn, vm); return -1; } @@ -928,8 +930,11 @@ static int 
umlStartVMDaemon(virConnectPtr conn, VIR_FREE(progenv[i]); VIR_FREE(progenv); -if (ret 0) +if (ret 0) { +virDomainConfVMNWFilterTeardown(vm); umlCleanupTapDevices(conn, vm); +} + /* NB we don't mark it running here - we do that async with inotify */ @@ -965,6 +970,7 @@ static void umlShutdownVMDaemon(virConnectPtr conn ATTRIBUTE_UNUSED, vm-def-id = -1; vm-state = VIR_DOMAIN_SHUTOFF; +virDomainConfVMNWFilterTeardown(vm); umlCleanupTapDevices(conn, vm); if (vm-newDef) { -- 1.7.1 -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] build: Fix permissions of sysconfig files
On Tue, Sep 07, 2010 at 10:04:37AM +0200, Jiri Denemark wrote: --- daemon/Makefile.am |2 +- libvirt.spec.in|4 tools/Makefile.am |2 +- 3 files changed, 2 insertions(+), 6 deletions(-) Okay, better fix this in the makefiles than the spec file ! ACK Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ dan...@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/ -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] Add nwfilter support to UML driver
On Tue, Sep 07, 2010 at 10:19:56AM +0200, Soren Hansen wrote: Extend user-mode-linux driver to support nwfilter. Signed-off-by: Soren Hansen so...@linux2go.dk --- src/uml/uml_conf.c | 16 +--- src/uml/uml_driver.c |8 +++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/uml/uml_conf.c b/src/uml/uml_conf.c index 4906192..f2eaef5 100644 --- a/src/uml/uml_conf.c +++ b/src/uml/uml_conf.c @@ -46,6 +46,7 @@ #include verify.h #include bridge.h #include logging.h +#include domain_nwfilter.h #define VIR_FROM_THIS VIR_FROM_UML @@ -108,7 +109,8 @@ virCapsPtr umlCapsInit(void) { static int -umlConnectTapDevice(virDomainNetDefPtr net, +umlConnectTapDevice(virConnectPtr conn, +virDomainNetDefPtr net, const char *bridge) { brControl *brctl = NULL; @@ -164,6 +166,14 @@ umlConnectTapDevice(virDomainNetDefPtr net, goto error; } +if (net-filter) { +if (virDomainConfNWFilterInstantiate(conn, net)) { +if (template_ifname) +VIR_FREE(net-ifname); +goto error; +} +} + brShutdown(brctl); return 0; @@ -239,7 +249,7 @@ umlBuildCommandLineNet(virConnectPtr conn, goto error; } -if (umlConnectTapDevice(def, bridge) 0) { +if (umlConnectTapDevice(conn, def, bridge) 0) { VIR_FREE(bridge); goto error; } @@ -250,7 +260,7 @@ umlBuildCommandLineNet(virConnectPtr conn, } case VIR_DOMAIN_NET_TYPE_BRIDGE: -if (umlConnectTapDevice(def, def-data.bridge.brname) 0) +if (umlConnectTapDevice(conn, def, def-data.bridge.brname) 0) goto error; /* ethNNN=tuntap,tapname,macaddr,gateway */ diff --git a/src/uml/uml_driver.c b/src/uml/uml_driver.c index 0a5c829..40345d5 100644 --- a/src/uml/uml_driver.c +++ b/src/uml/uml_driver.c @@ -58,6 +58,7 @@ #include domain_conf.h #include datatypes.h #include logging.h +#include domain_nwfilter.h #define VIR_FROM_THIS VIR_FROM_UML @@ -876,6 +877,7 @@ static int umlStartVMDaemon(virConnectPtr conn, if (umlBuildCommandLine(conn, driver, vm, keepfd, argv, progenv) 0) { close(logfd); +virDomainConfVMNWFilterTeardown(vm); umlCleanupTapDevices(conn, vm); return -1; } 
@@ -928,8 +930,11 @@ static int umlStartVMDaemon(virConnectPtr conn, VIR_FREE(progenv[i]); VIR_FREE(progenv); -if (ret 0) +if (ret 0) { +virDomainConfVMNWFilterTeardown(vm); umlCleanupTapDevices(conn, vm); +} + /* NB we don't mark it running here - we do that async with inotify */ @@ -965,6 +970,7 @@ static void umlShutdownVMDaemon(virConnectPtr conn ATTRIBUTE_UNUSED, vm-def-id = -1; vm-state = VIR_DOMAIN_SHUTOFF; +virDomainConfVMNWFilterTeardown(vm); umlCleanupTapDevices(conn, vm); if (vm-newDef) { We are supposed to be in feature freeze mode this week, but this looks simple enough and I don't think this can introduce regression, so ACK and I suggest to push this to git before Friday, unless someone disagrees, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ dan...@veillard.com | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/ -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] build: Fix permissions of sysconfig files
daemon/Makefile.am |2 +- libvirt.spec.in|4 tools/Makefile.am |2 +- 3 files changed, 2 insertions(+), 6 deletions(-) Okay, better fix this in the makefiles than the spec file ! ACK Thanks, pushed. Jirka -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] Add nwfilter support to UML driver
On 07-09-2010 10:32, Daniel Veillard wrote: We are supposed to be in feature freeze mode this week, Apologies. I didn't realise. Where could I have learned this? -- Soren Hansen Ubuntu Developer http://www.ubuntu.com/ -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] Question, how to use virDomainQemuMonitorCommand()
On 09/07/10 - 04:08:13PM, Lai Jiangshan wrote: Hi, Chris, I saw virDomainQemuMonitorCommand() in libvirt-qemu.c, I think it will help me to send arbitrary qemu-monitor command to qemu via libvirtd. But how can I use virDomainQemuMonitorCommand()? Can I use it by just using current tools (virsh or other) without writing any code? Unfortunately, no. There is a bug in the current virsh command that prevents it from properly parsing the command-lines necessary to send monitor commands to the qemu monitor. Until we fix that bug, we won't push the support into virsh. For that reason you will need to write a custom program to call virDomainQemuMonitorCommand as appropriate. The absolute easiest program you can write looks something like (untested): #include <stdio.h> #include <stdlib.h> #include <libvirt/libvirt.h> int main() { virConnectPtr conn; virDomainPtr dom; char *reply; conn = virConnectOpen(NULL); dom = virDomainLookupByName(conn, "mydomain"); virDomainQemuMonitorCommand(dom, "info cpus", &reply, 0); fprintf(stderr, "Reply: %s\n", reply); free(reply); virConnectClose(conn); return 0; } -- Chris Lalancette -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] ruby-libvirt / rubygems
On 09/03/10 - 09:23:48PM, Roland Moriz wrote: Hello Chris, could you publish the updated ruby-libvirt bindings to rubygems.org? There is still only version 0.1.0 of 2008 available: http://rubygems.org/gems/ruby-libvirt Thank you! Hello, I'd be happy to update the gem on rubygems.org. Unfortunately I don't have permission to do so, so I can't right at the moment. I've written to the previous maintainer of ruby-libvirt to see if he can give me access to update the gems there. As soon as I get that access, I'll push a new version of the gem and let you know. Thanks, -- Chris Lalancette -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
[libvirt] QEMU interfaces for image streaming and post-copy block migration
Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: Today, you can create images based on base images that are copy on write. With QED, we also support copy on read which forces a copy from the backing image on read requests and write requests. In additional to copy on read, we introduce a notion of streaming a block device which means that we search for an unallocated region of the leaf image and force a copy-on-read operation. The combination of copy-on-read and streaming means that you can start a guest based on slow storage (like over the network) and bring in blocks on demand while also having a deterministic mechanism to complete the transfer. The interface for copy-on-read is just an option within qemu-img create. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: stream device sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. A related topic is block migration. Today we support pre-copy migration which means we transfer the block device and then do a live migration. 
Another approach is to do a live migration, and on the source, run a block server using image streaming on the destination to move the device. With QED, to implement this one would: 1) launch qemu-nbd on the source while the guest is running 2) create a qed file on the destination with copy-on-read enabled and a backing file using nbd: to point to the source qemu-nbd 3) run qemu -incoming on the destination with the qed file 4) execute the migration 5) when migration completes, begin streaming on the destination to complete the copy 6) when the streaming is complete, shut down the qemu-nbd instance on the source This is a bit involved and we could potentially automate some of this in qemu by launching qemu-nbd and providing commands to do some of this. Again though, I think the question is what type of interfaces would libvirt prefer? Low level interfaces + recipes on how to do high level things or higher level interfaces? Regards, Anthony Liguori -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 09:01 AM, Alexander Graf wrote: I'm torn here too. Why not expose both? Have a qemu internal daemon available that gets a sleep time as parameter and an external pull sectors command. We'll see which one is more useful, but I don't think it's too much code to justify only having one of the two. And the internal daemon could be started using a command line parameter, which helps non-managed users. Let me turn it around and ask, how would libvirt do this? Would they just use a sleep time parameter and just make use of our command or would they do something more clever and attempt to detect system idle? Could we just do that in qemu? Or would they punt to the end user? A related topic is block migration. Today we support pre-copy migration which means we transfer the block device and then do a live migration. Another approach is to do a live migration, and on the source, run a block server using image streaming on the destination to move the device. With QED, to implement this one would: 1) launch qemu-nbd on the source while the guest is running 2) create a qed file on the destination with copy-on-read enabled and a backing file using nbd: to point to the source qemu-nbd 3) run qemu -incoming on the destination with the qed file 4) execute the migration 5) when migration completes, begin streaming on the destination to complete the copy 6) when the streaming is complete, shut down the qemu-nbd instance on the source This is a bit involved and we could potentially automate some of this in qemu by launching qemu-nbd and providing commands to do some of this. Again though, I think the question is what type of interfaces would libvirt prefer? Low level interfaces + recipes on how to do high level things or higher level interfaces? Is there anything keeping us from making the QMP socket multiplexable? 
I was thinking of something like: { command = nbd_server ; block = qemu_block_name } { result = done } qmp socket turns into nbd socket This way we don't require yet another port, don't have to care about conflicts and get internal qemu block names for free. Possibly, but something that complicates life here is that an nbd session would be source - destination but there's no QMP session between source - destination. Instead, there's a session from source - management node and destination - management node so you'd have to proxy nbd traffic between the two. That gets ugly quick. Regards, Anthony Liguori Alex -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 09:34 AM, Kevin Wolf wrote: Am 07.09.2010 15:41, schrieb Anthony Liguori: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: Today, you can create images based on base images that are copy on write. With QED, we also support copy on read which forces a copy from the backing image on read requests and write requests. In additional to copy on read, we introduce a notion of streaming a block device which means that we search for an unallocated region of the leaf image and force a copy-on-read operation. The combination of copy-on-read and streaming means that you can start a guest based on slow storage (like over the network) and bring in blocks on demand while also having a deterministic mechanism to complete the transfer. The interface for copy-on-read is just an option within qemu-img create. Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. The way it's implemented in QED is that it's a compatible feature. This means that implementations are allowed to ignore it if they want to. It's really a suggestion. So yes, you could have a run time switch that overrides the feature bit on disk and either forces copy-on-read on or off. Do we have a way to pass block drivers run time options? Doing it this way has the additional advantage that you need no image format support for this, so we could implement copy-on-read for other formats, too. To do it efficiently, it really needs to be in the format for the same reason that copy-on-write is part of the format. You need to understand the cluster boundaries in order to optimize the metadata updates. 
Sure, you can expose interfaces to the block layer to give all of this info but that's solving the same problem for doing block level copy-on-write. The other challenge is that for copy-on-read to be efficiently, you really need a format that can distinguish between unallocated sectors and zero sectors and do zero detection during the copy-on-read operation. Otherwise, if you have a 10G virtual disk with a backing file that's 1GB is size, copy-on-read will result in the leaf being 10G instead of ~1GB. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: streamdevice sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. I think libvirt shouldn't have to care about sector offsets. You should just tell qemu to fetch the image and it should do so. We could have something like -drive backing_mode=[cow|cor|stream]. This interface let's libvirt decide when the I/O system is idle. The sector is really just a token to keep track of our overall progress. One thing I envisioned was that a tool like virt-manager could have a progress bar showing the streaming progress. It could update the progress bar based on (offset * 512) / image_size. If libvirt isn't driving it, we need to detect idle I/O time and we need to provide an interface to query status. 
Not a huge problem but I'm not sure that a single QEMU instance can properly detect idle I/O time. Regards, Anthony Liguori A related topic is block migration. Today we support pre-copy migration which means we transfer the block device and then do a live migration. Another approach is to do a live migration, and on the source, run a block server using image streaming on the destination to move the device. With QED, to implement this one would: 1) launch qemu-nbd on the source while the guest is running 2) create a qed file on the destination with copy-on-read enabled and a backing file using nbd: to point to the source qemu-nbd 3) run qemu -incoming on the destination with the qed file 4) execute the migration 5) when migration completes, begin streaming on the destination to complete the copy 6) when the streaming is complete, shut down the qemu-nbd instance on the source Hm, that's an interesting idea. :-) Kevin -- libvir-list mailing list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 09:33 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 2:41 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: The interface for copy-on-read is just an option within qemu-img create. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: streamdevice sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. A self-tuning solution is attractive because it reduces the need for other components (management stack) or the user to get involved. In this case self-tuning should be possible. We need to detect periods of I/O inactivity, for example tracking the number of in-flight requests and then setting a grace timer when it reaches zero. When the grace timer expires, we start streaming until the guest initiates I/O again. That detects idle I/O within a single QEMU guest, but you might have another guest running that's I/O bound which means that from an overall system throughput perspective, you really don't want to stream. I think libvirt might be able to do a better job here by looking at overall system I/O usage. But I'm not sure hence this RFC :-) Regards, Anthony Liguori Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 09:49 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 3:34 PM, Kevin Wolfkw...@redhat.com wrote: Am 07.09.2010 15:41, schrieb Anthony Liguori: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: Today, you can create images based on base images that are copy on write. With QED, we also support copy on read which forces a copy from the backing image on read requests and write requests. In additional to copy on read, we introduce a notion of streaming a block device which means that we search for an unallocated region of the leaf image and force a copy-on-read operation. The combination of copy-on-read and streaming means that you can start a guest based on slow storage (like over the network) and bring in blocks on demand while also having a deterministic mechanism to complete the transfer. The interface for copy-on-read is just an option within qemu-img create. Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. Doing it this way has the additional advantage that you need no image format support for this, so we could implement copy-on-read for other formats, too. I agree that streaming should be generic, like block migration. The trivial generic implementation is: void bdrv_stream(BlockDriverState* bs) { for (sector = 0; sector bdrv_getlength(bs); sector += n) { if (!bdrv_is_allocated(bs, sector,n)) { Three problems here. First problem is that bdrv_is_allocated is synchronous. The second problem is that streaming makes the most sense when it's the smallest useful piece of work whereas bdrv_is_allocated() may return a very large range. 
You could cap it here but you then need to make sure that cap is at least cluster_size to avoid a lot of unnecessary I/O. The QED streaming implementation is 140 LOCs too so you quickly end up adding more code to the block formats to support these new interfaces than it takes to just implement it in the block format. Third problem is that streaming really requires being able to do zero write detection in a meaningful way. You don't want to always do zero write detection so you need another interface to mark a specific write as a write that should be checked for zeros. Regards, Anthony Liguori -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On Tue, Sep 7, 2010 at 2:41 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: The interface for copy-on-read is just an option within qemu-img create. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: stream device sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. A self-tuning solution is attractive because it reduces the need for other components (management stack) or the user to get involved. In this case self-tuning should be possible. We need to detect periods of I/O inactivity, for example tracking the number of in-flight requests and then setting a grace timer when it reaches zero. When the grace timer expires, we start streaming until the guest initiates I/O again. Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On Tue, Sep 7, 2010 at 3:34 PM, Kevin Wolf kw...@redhat.com wrote: Am 07.09.2010 15:41, schrieb Anthony Liguori: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: Today, you can create images based on base images that are copy on write. With QED, we also support copy on read which forces a copy from the backing image on read requests and write requests. In additional to copy on read, we introduce a notion of streaming a block device which means that we search for an unallocated region of the leaf image and force a copy-on-read operation. The combination of copy-on-read and streaming means that you can start a guest based on slow storage (like over the network) and bring in blocks on demand while also having a deterministic mechanism to complete the transfer. The interface for copy-on-read is just an option within qemu-img create. Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. Doing it this way has the additional advantage that you need no image format support for this, so we could implement copy-on-read for other formats, too. I agree that streaming should be generic, like block migration. The trivial generic implementation is: void bdrv_stream(BlockDriverState* bs) { for (sector = 0; sector bdrv_getlength(bs); sector += n) { if (!bdrv_is_allocated(bs, sector, n)) { bdrv_read(bs, sector, ...); bdrv_write(bs, sector, ...); } } } Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: stream device sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. 
The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. I think libvirt shouldn't have to care about sector offsets. You should just tell qemu to fetch the image and it should do so. We could have something like -drive backing_mode=[cow|cor|stream]. A related topic is block migration. Today we support pre-copy migration which means we transfer the block device and then do a live migration. Another approach is to do a live migration, and on the source, run a block server using image streaming on the destination to move the device. With QED, to implement this one would: 1) launch qemu-nbd on the source while the guest is running 2) create a qed file on the destination with copy-on-read enabled and a backing file using nbd: to point to the source qemu-nbd 3) run qemu -incoming on the destination with the qed file 4) execute the migration 5) when migration completes, begin streaming on the destination to complete the copy 6) when the streaming is complete, shut down the qemu-nbd instance on the source Hm, that's an interesting idea. :-) Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
Am 07.09.2010 15:41, schrieb Anthony Liguori: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: Today, you can create images based on base images that are copy on write. With QED, we also support copy on read which forces a copy from the backing image on read requests and write requests. In additional to copy on read, we introduce a notion of streaming a block device which means that we search for an unallocated region of the leaf image and force a copy-on-read operation. The combination of copy-on-read and streaming means that you can start a guest based on slow storage (like over the network) and bring in blocks on demand while also having a deterministic mechanism to complete the transfer. The interface for copy-on-read is just an option within qemu-img create. Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. Doing it this way has the additional advantage that you need no image format support for this, so we could implement copy-on-read for other formats, too. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: stream device sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. 
The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. I think libvirt shouldn't have to care about sector offsets. You should just tell qemu to fetch the image and it should do so. We could have something like -drive backing_mode=[cow|cor|stream]. A related topic is block migration. Today we support pre-copy migration which means we transfer the block device and then do a live migration. Another approach is to do a live migration, and on the source, run a block server using image streaming on the destination to move the device. With QED, to implement this one would: 1) launch qemu-nbd on the source while the guest is running 2) create a qed file on the destination with copy-on-read enabled and a backing file using nbd: to point to the source qemu-nbd 3) run qemu -incoming on the destination with the qed file 4) execute the migration 5) when migration completes, begin streaming on the destination to complete the copy 6) when the streaming is complete, shut down the qemu-nbd instance on the source Hm, that's an interesting idea. :-) Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On Tue, Sep 7, 2010 at 3:51 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: On 09/07/2010 09:33 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 2:41 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: The interface for copy-on-read is just an option within qemu-img create. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: streamdevice sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. A self-tuning solution is attractive because it reduces the need for other components (management stack) or the user to get involved. In this case self-tuning should be possible. We need to detect periods of I/O inactivity, for example tracking the number of in-flight requests and then setting a grace timer when it reaches zero. When the grace timer expires, we start streaming until the guest initiates I/O again. That detects idle I/O within a single QEMU guest, but you might have another guest running that's I/O bound which means that from an overall system throughput perspective, you really don't want to stream. I think libvirt might be able to do a better job here by looking at overall system I/O usage. But I'm not sure hence this RFC :-) Isn't this what block I/O controller cgroups is meant to solve? 
If you give vm-1 50% block bandwidth and vm-2 50% block bandwidth then vm-1 can do streaming without eating into vm-2's guaranteed bandwidth. Also, I'm not sure we should worry about the priority of the I/O too much: perhaps the user wants their vm to stream more than they want an unimportant local vm that is currently I/O bound to have all resources to itself. So I think it makes sense to defer this and not try for system-wide knowledge inside a QEMU process. Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
Am 07.09.2010 16:49, schrieb Anthony Liguori: Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. The way it's implemented in QED is that it's a compatible feature. This means that implementations are allowed to ignore it if they want to. It's really a suggestion. Well, the point is that I see no reason why an image should contain this suggestion. There's really nothing about an image that could reasonably indicate use this better with copy-on-read than with copy-on-write. It's a decision you make when using the image. So yes, you could have a run time switch that overrides the feature bit on disk and either forces copy-on-read on or off. Do we have a way to pass block drivers run time options? We'll get them with -blockdev. Today we're using colons for format specific and separate -drive options for generic things. Doing it this way has the additional advantage that you need no image format support for this, so we could implement copy-on-read for other formats, too. To do it efficiently, it really needs to be in the format for the same reason that copy-on-write is part of the format. Copy-on-write is not part of the format, it's a way of how to use this format. Backing files are part of the format, and they are used for both copy-on-write and copy-on-read. Any driver implementing a format that has support for backing files should be able to implement copy-on-read. You need to understand the cluster boundaries in order to optimize the metadata updates. Sure, you can expose interfaces to the block layer to give all of this info but that's solving the same problem for doing block level copy-on-write. The other challenge is that for copy-on-read to be efficiently, you really need a format that can distinguish between unallocated sectors and zero sectors and do zero detection during the copy-on-read operation. 
Otherwise, if you have a 10G virtual disk with a backing file that's 1GB in size, copy-on-read will result in the leaf being 10G instead of ~1GB. That's a good point. But it's not a reason to make the interface specific to QED just because other formats would probably not implement it as efficiently. Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 09:55 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 3:51 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: On 09/07/2010 09:33 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 2:41 PM, Anthony Liguori aligu...@linux.vnet.ibm.comwrote: The interface for copy-on-read is just an option within qemu-img create. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: streamdevicesector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offsetimage_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. A self-tuning solution is attractive because it reduces the need for other components (management stack) or the user to get involved. In this case self-tuning should be possible. We need to detect periods of I/O inactivity, for example tracking the number of in-flight requests and then setting a grace timer when it reaches zero. When the grace timer expires, we start streaming until the guest initiates I/O again. That detects idle I/O within a single QEMU guest, but you might have another guest running that's I/O bound which means that from an overall system throughput perspective, you really don't want to stream. I think libvirt might be able to do a better job here by looking at overall system I/O usage. 
But I'm not sure hence this RFC :-) Isn't this what block I/O controller cgroups is meant to solve? If you give vm-1 50% block bandwidth and vm-2 50% block bandwidth then vm-1 can do streaming without eating into vm-2's guaranteed bandwidth. That assumes you're capping I/O. But sometimes you care about overall system throughput more than you care about any individual VM. Another way to look at it may be, a user waits for a cron job that runs at midnight and starts streaming as necessary. However, the user wants to be able to interrupt the streaming should there be a sudden demand. If the user drives the streaming through an interface like I've specified, they're in full control. It's pretty simple to build interfaces on top of this that implement stream as an aggressive or conservative background task too. Also, I'm not sure we should worry about the priority of the I/O too much: perhaps the user wants their vm to stream more than they want an unimportant local vm that is currently I/O bound to have all resources to itself. So I think it makes sense to defer this and not try for system-wide knowledge inside a QEMU process. Right, so that argues for an incremental interface like I started with :-) BTW, this whole discussion is also relevant for other background tasks like online defragmentation so keep that use-case in mind too. Regards, Anthony Liguori Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On Tue, Sep 7, 2010 at 3:57 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: On 09/07/2010 09:49 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 3:34 PM, Kevin Wolfkw...@redhat.com wrote: Am 07.09.2010 15:41, schrieb Anthony Liguori: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: Today, you can create images based on base images that are copy on write. With QED, we also support copy on read which forces a copy from the backing image on read requests and write requests. In additional to copy on read, we introduce a notion of streaming a block device which means that we search for an unallocated region of the leaf image and force a copy-on-read operation. The combination of copy-on-read and streaming means that you can start a guest based on slow storage (like over the network) and bring in blocks on demand while also having a deterministic mechanism to complete the transfer. The interface for copy-on-read is just an option within qemu-img create. Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. Doing it this way has the additional advantage that you need no image format support for this, so we could implement copy-on-read for other formats, too. I agree that streaming should be generic, like block migration. The trivial generic implementation is: void bdrv_stream(BlockDriverState* bs) { for (sector = 0; sector bdrv_getlength(bs); sector += n) { if (!bdrv_is_allocated(bs, sector,n)) { Three problems here. First problem is that bdrv_is_allocated is synchronous. The second problem is that streaming makes the most sense when it's the smallest useful piece of work whereas bdrv_is_allocated() may return a very large range. 
You could cap it here but you then need to make sure that cap is at least cluster_size to avoid a lot of unnecessary I/O. The QED streaming implementation is 140 LOCs too so you quickly end up adding more code to the block formats to support these new interfaces than it takes to just implement it in the block format. Third problem is that streaming really requires being able to do zero write detection in a meaningful way. You don't want to always do zero write detection so you need another interface to mark a specific write as a write that should be checked for zeros. Good points. I agree that it is easiest to write features into the block driver, but there is a significant amount of code duplication, plus the barrier for enabling other block drivers with these features is increased. These points (except the lines of code argument) can be addressed with the proper extensions to the block driver interface. Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] QEMU interfaces for image streaming and post-copy block migration
On Tue, Sep 07, 2010 at 08:41:44AM -0500, Anthony Liguori wrote: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: [snip] A related topic is block migration. Today we support pre-copy migration which means we transfer the block device and then do a live migration. Another approach is to do a live migration, and on the source, run a block server using image streaming on the destination to move the device. With QED, to implement this one would: 1) launch qemu-nbd on the source while the guest is running 2) create a qed file on the destination with copy-on-read enabled and a backing file using nbd: to point to the source qemu-nbd 3) run qemu -incoming on the destination with the qed file 4) execute the migration 5) when migration completes, begin streaming on the destination to complete the copy 6) when the streaming is complete, shut down the qemu-nbd instance on the source IMHO, adding further network sockets is the one thing we absolutely don't want to do to migration. I don't much like the idea of launching extra daemons either. This is a bit involved and we could potentially automate some of this in qemu by launching qemu-nbd and providing commands to do some of this. Again though, I think the question is what type of interfaces would libvirt prefer? Low level interfaces + recipes on how to do high level things or higher level interfaces? I think it should be done entirely within the main QEMU migration socket. I know this isn't possible with the current impl, since it is unidirectional, preventing the target sending the source requests for specific data blocks. If we made migration socket bi-directional I think we could do it all within qemu with no external helpers or extra sockets 1. 
Create empty qed file on the destination with copy on read enabled and a backing file pointing to a special 'migrate:' protocol 2. Run qemu -incoming on the destination with the qed file 3. execute the migration 4. when migration completes, target QEMU continues streaming blocks from the source qemu. 5. when streaming is complete, source qemu can shutdown. Both your original proposal and mine here seem to have a pretty bad failure scenario though. After the cut-over point where the VM cpus start running on the destination QEMU, AFAICT, any failure on the source before block streaming completes leaves you dead in the water. The source VM no longer has up2date RAM contents and the destination VM does not yet have a complete disk image. Regards, Daniel -- |: Red Hat, Engineering, London-o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org-o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 10:02 AM, Kevin Wolf wrote: Am 07.09.2010 16:49, schrieb Anthony Liguori: Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. The way it's implemented in QED is that it's a compatible feature. This means that implementations are allowed to ignore it if they want to. It's really a suggestion. Well, the point is that I see no reason why an image should contain this suggestion. There's really nothing about an image that could reasonably indicate use this better with copy-on-read than with copy-on-write. It's a decision you make when using the image. Copy-on-read is, in many cases, a property of the backing file because it suggests that the backing file is either very slow or potentially volatile. IOW, let's say I'm an image distributor and I want to provide my images in a QED format that actually streams the image from an http server. I could provide a QED file without a copy-on-read bit set but I'd really like to convey this information as part of the image. You can argue that I should provide a config file too that contained the copy-on-read flag set but you could make the same argument about backing files too. So yes, you could have a run time switch that overrides the feature bit on disk and either forces copy-on-read on or off. Do we have a way to pass block drivers run time options? We'll get them with -blockdev. Today we're using colons for format specific and separate -drive options for generic things. That's right. I think I'd rather wait for -blockdev. You need to understand the cluster boundaries in order to optimize the metadata updates. Sure, you can expose interfaces to the block layer to give all of this info but that's solving the same problem for doing block level copy-on-write. 
The other challenge is that for copy-on-read to be efficiently, you really need a format that can distinguish between unallocated sectors and zero sectors and do zero detection during the copy-on-read operation. Otherwise, if you have a 10G virtual disk with a backing file that's 1GB is size, copy-on-read will result in the leaf being 10G instead of ~1GB. That's a good point. But it's not a reason to make the interface specific to QED just because other formats would probably not implement it as efficiently. You really can't do as good of a job in the block layer because you have very little info about the characteristics of the disk image. Regards, Anthony Liguori Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On Tue, Sep 7, 2010 at 4:00 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: On 09/07/2010 09:55 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 3:51 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: On 09/07/2010 09:33 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 2:41 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: The interface for copy-on-read is just an option within qemu-img create. Streaming, on the other hand, requires a bit more thought. Today, I have a monitor command that does the following: streamdevice sector offset Which will try to stream the minimal amount of data for a single I/O operation and then return how many sectors were successfully streamed. The idea about how to drive this interface is a loop like: offset = 0; while offset image_size: wait_for_idle_time() count = stream(device, offset) offset += count Obviously, the wait_for_idle_time() requires wide system awareness. The thing I'm not sure about is 1) would libvirt want to expose a similar stream interface and let management software determine idle time 2) attempt to detect idle time on it's own and provide a higher level interface. If (2), the question then becomes whether we should try to do this within qemu and provide libvirt a higher level interface. A self-tuning solution is attractive because it reduces the need for other components (management stack) or the user to get involved. In this case self-tuning should be possible. We need to detect periods of I/O inactivity, for example tracking the number of in-flight requests and then setting a grace timer when it reaches zero. When the grace timer expires, we start streaming until the guest initiates I/O again. That detects idle I/O within a single QEMU guest, but you might have another guest running that's I/O bound which means that from an overall system throughput perspective, you really don't want to stream. I think libvirt might be able to do a better job here by looking at overall system I/O usage. 
But I'm not sure hence this RFC :-) Isn't this what block I/O controller cgroups is meant to solve? If you give vm-1 50% block bandwidth and vm-2 50% block bandwidth then vm-1 can do streaming without eating into vm-2's guaranteed bandwidth. That assumes you're capping I/O. But sometimes you care about overall system throughput more than you care about any individual VM. Another way to look at it may be, a user waits for a cron job that runs at midnight and starts streaming as necessary. However, the user wants to be able to interrupt the streaming should there been a sudden demand. If the user drives the streaming through an interface like I've specified, they're in full control. It's pretty simple to build a interfaces on top of this that implement stream as an aggressive or conservative background task too. Also, I'm not sure we should worry about the priority of the I/O too much: perhaps the user wants their vm to stream more than they want an unimportant local vm that is currently I/O bound to have all resources to itself. So I think it makes sense to defer this and not try for system-wide knowledge inside a QEMU process. Right, so that argues for an incremental interface like I started with :-) BTW, this whole discussion is also relevant for other background tasks like online defragmentation so keep that use-case in mind too. Right, I'm a little hesitant to get too far into discussing the management interface because I remember long threads about polling and async. I never fully read them but I bet some wisdom came out of them that applies here. There are two ways to do a long running (async?) task: 1. Multiple smaller pokes. Perhaps completion of a single poke is async. But the key is that the interface is incremental and driven by the management stack. 2. State. Turn on streaming and watch it go. You can find out its current state using another command which will tell you whether it is enabled/disabled and progress. Use a command to disable it. 
Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 10:03 AM, Daniel P. Berrange wrote: On Tue, Sep 07, 2010 at 08:41:44AM -0500, Anthony Liguori wrote: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: [snip] A related topic is block migration. Today we support pre-copy migration which means we transfer the block device and then do a live migration. Another approach is to do a live migration, and on the source, run a block server using image streaming on the destination to move the device. With QED, to implement this one would: 1) launch qemu-nbd on the source while the guest is running 2) create a qed file on the destination with copy-on-read enabled and a backing file using nbd: to point to the source qemu-nbd 3) run qemu -incoming on the destination with the qed file 4) execute the migration 5) when migration completes, begin streaming on the destination to complete the copy 6) when the streaming is complete, shut down the qemu-nbd instance on the source IMHO, adding further network sockets is the one thing we absolutely don't want to do to migration. I don't much like the idea of launching extra daemons either. One of the use cases I'm trying to accommodate is migration to free resources. By launching a qemu-nbd daemon, we can kill the source qemu process and free up all of the associated memory. This is a bit involved and we could potentially automate some of this in qemu by launching qemu-nbd and providing commands to do some of this. Again though, I think the question is what type of interfaces would libvirt prefer? Low level interfaces + recipes on how to do high level things or higher level interfaces? I think it should be done entirely within the main QEMU migration socket. 
I know this isn't possible with the current impl, since it is unidirectional, preventing the target sending the source requests for specific data blocks. If we made migration socket bi-directional I think we could do it all within qemu with no external helpers or extra sockets 1. Create empty qed file on the destination with copy on read enable backing file pointing to a special 'migrate:' protocol Why not just point migration and nbd to a unix domain socket and then multiplex the two protocols at a higher level? 2. Run qemu -incoming on the destination with with the qed file 3. execute the migration 4. when migration completes, target QEMU continues streaming blocks from the soruce qemu. 5. when streaming is complete, source qemu can shutdown. Both your original proposal and mine here seem to have a pretty bad failure scenario though. After the cut-over point where the VM cpus start running on the destination QEMU, AFAICT, any failure on the source before block streaming complete leaves you dead in the water. The source VM no longer has up2date RAM contents and the destination VM does not yet have a complete disk image. Yes. It's a trade off. However, pre-copy doesn't really change your likelihood of catastrophic failure because if you were going to fail in the source, it was going to happen before you completed the block transfer anyway. The advantage of post-copy is that you immediately free resources on the source so as a reaction to pressure from overcommit, it's tremendously useful. I still think pre-copy has it's place though. Regards, Anthony Liguori Regards, Daniel -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 10:05 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 3:57 PM, Anthony Liguori aligu...@linux.vnet.ibm.com wrote: On 09/07/2010 09:49 AM, Stefan Hajnoczi wrote: On Tue, Sep 7, 2010 at 3:34 PM, Kevin Wolfkw...@redhat.comwrote: Am 07.09.2010 15:41, schrieb Anthony Liguori: Hi, We've got copy-on-read and image streaming working in QED and before going much further, I wanted to bounce some interfaces off of the libvirt folks to make sure our final interface makes sense. Here's the basic idea: Today, you can create images based on base images that are copy on write. With QED, we also support copy on read which forces a copy from the backing image on read requests and write requests. In additional to copy on read, we introduce a notion of streaming a block device which means that we search for an unallocated region of the leaf image and force a copy-on-read operation. The combination of copy-on-read and streaming means that you can start a guest based on slow storage (like over the network) and bring in blocks on demand while also having a deterministic mechanism to complete the transfer. The interface for copy-on-read is just an option within qemu-img create. Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. Doing it this way has the additional advantage that you need no image format support for this, so we could implement copy-on-read for other formats, too. I agree that streaming should be generic, like block migration. The trivial generic implementation is: void bdrv_stream(BlockDriverState* bs) { for (sector = 0; sectorbdrv_getlength(bs); sector += n) { if (!bdrv_is_allocated(bs, sector,n)) { Three problems here. First problem is that bdrv_is_allocated is synchronous. 
The second problem is that streaming makes the most sense when it's the smallest useful piece of work whereas bdrv_is_allocated() may return a very large range. You could cap it here but you then need to make sure that cap is at least cluster_size to avoid a lot of unnecessary I/O. The QED streaming implementation is 140 LOCs too so you quickly end up adding more code to the block formats to support these new interfaces than it takes to just implement it in the block format. Third problem is that streaming really requires being able to do zero write detection in a meaningful way. You don't want to always do zero write detection so you need another interface to mark a specific write as a write that should be checked for zeros. Good points. I agree that it is easiest to write features into the block driver, but there is a significant amount of code duplication, There's two ways to attack code duplication. The first is to move the feature into block.c and add interfaces to the block drivers to support it. The second is to keep it in qed.c but to abstract out things that could really be common to multiple drivers (like the find_cluster functionality and some of the request handling functionality). I prefer the later approach because it keeps a high quality implementation of copy-on-read whereas the former is almost certainly going to dumb down the implementation. plus the barrier for enabling other block drivers with these features is increased. These points (except the lines of code argument) can be addressed with the proper extensions to the block driver interface. Regards, Anthony Liguori Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 10:20 AM, Kevin Wolf wrote: Am 07.09.2010 17:11, schrieb Anthony Liguori: On 09/07/2010 10:02 AM, Kevin Wolf wrote: Am 07.09.2010 16:49, schrieb Anthony Liguori: Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. The way it's implemented in QED is that it's a compatible feature. This means that implementations are allowed to ignore it if they want to. It's really a suggestion. Well, the point is that I see no reason why an image should contain this suggestion. There's really nothing about an image that could reasonably indicate use this better with copy-on-read than with copy-on-write. It's a decision you make when using the image. Copy-on-read is, in many cases, a property of the backing file because it suggests that the backing file is either very slow or potentially volatile. The simple copy-on-read without actively streaming the rest of the image is not enough anyway for volatile backing files. But as a web site owner, it's extremely useful for me to associate copy-on-read with an image because it significantly reduces my bandwidth. I have a hard time believing this isn't a valuable use-case and not one that's actually pretty common. IOW, let's say I'm an image distributor and I want to provide my images in a QED format that actually streams the image from an http server. I could provide a QED file without a copy-on-read bit set but I'd really like to convey this information as part of the image. You can argue that I should provide a config file too that contained the copy-on-read flag set but you could make the same argument about backing files too. No. The image is perfectly readable when using COW instead of COR. On the other hand, it's completely meaningless without its backing file. N.B. 
the whole concept of compat features in QED is that if the features are ignored, the image is still perfectly readable. It's extra information that let's an implementation to smarter things with a given image. So yes, you could have a run time switch that overrides the feature bit on disk and either forces copy-on-read on or off. Do we have a way to pass block drivers run time options? We'll get them with -blockdev. Today we're using colons for format specific and separate -drive options for generic things. That's right. I think I'd rather wait for -blockdev. Well, then I consider -blockdev a dependency of QED (the copy-on-read part at least) and we can't merge it before we have -blockdev. If we determine that having copy-on-read be a part of the image is universally a bad idea, then I'd agree with you. Keep in mind, I don't expect to merge the cor or streaming stuff with the first merge of QED. I'm still not convinced that having cor as a compat feature is a bad idea though. You really can't do as good of a job in the block layer because you have very little info about the characteristics of the disk image. I'm not saying that the generic block layer should implement copy-on-read. I just think that it should pass a run-time option to the driver - maybe just a BDRV_O_COPY_ON_READ flag - instead of having the information in the image file. From a user perspective it should look the same for qed, qcow2 and whatever else (like copy-on-write today) Okay, the only place I'm disagreeing slightly is that I think an image format should be able to request copy_on_read such that the default behavior if an explicit flag isn't specified is to do what the image suggests we do. Regards, Anthony Liguori Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
Am 07.09.2010 17:30, schrieb Anthony Liguori: On 09/07/2010 10:20 AM, Kevin Wolf wrote: Am 07.09.2010 17:11, schrieb Anthony Liguori: Copy-on-read is, in many cases, a property of the backing file because it suggests that the backing file is either very slow or potentially volatile. The simple copy-on-read without actively streaming the rest of the image is not enough anyway for volatile backing files. But as a web site owner, it's extremely useful for me to associate copy-on-read with an image because it significantly reduces my bandwidth. I have a hard time believing this isn't a valuable use-case and not one that's actually pretty common. As a web site user, I don't necessarily want you to control the behaviour of my qemu. :-) But I do see your point there. You really can't do as good of a job in the block layer because you have very little info about the characteristics of the disk image. I'm not saying that the generic block layer should implement copy-on-read. I just think that it should pass a run-time option to the driver - maybe just a BDRV_O_COPY_ON_READ flag - instead of having the information in the image file. From a user perspective it should look the same for qed, qcow2 and whatever else (like copy-on-write today) Okay, the only place I'm disagreeing slightly is that I think an image format should be able to request copy_on_read such that the default behavior if an explicit flag isn't specified is to do what the image suggests we do. Maybe we can agree on that. I'm not completely decided yet if allowing the image to contain such a hint is a good or a bad thing. Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] Why is avahi-daemon being started?
[adding libvir-list] On 09/07/2010 07:42 AM, Tom Horsley wrote: On Tue, 07 Sep 2010 13:51:09 +0100 Adam Williamson wrote: I did find a Should-start: avahi-daemon comment in the libvirtd init script, so maybe that is the source. Shouldn't be. 'Should-start' means 'if this other service is enabled, it should be started before this one': it's not a strict dependency, 'this other service MUST be started before this one'. Yea, changing the comment there didn't fix anything, but it was a good hint since I finally found the mdns_adv setting in the libvirtd.conf file and uncommenting it did finally squash avahi-daemon :-). I'm wondering if this means that libvirt should change any of its policies about auto-starting avahi-daemon, or at the very least, if there is a documentation shortcoming on why libvirt defaults to enabling this and when you might want to change that default. -- Eric Blake ebl...@redhat.com+1-801-349-2682 Libvirt virtualization library http://libvirt.org -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 10:39 AM, Kevin Wolf wrote: Am 07.09.2010 17:30, schrieb Anthony Liguori: On 09/07/2010 10:20 AM, Kevin Wolf wrote: Am 07.09.2010 17:11, schrieb Anthony Liguori: Copy-on-read is, in many cases, a property of the backing file because it suggests that the backing file is either very slow or potentially volatile. The simple copy-on-read without actively streaming the rest of the image is not enough anyway for volatile backing files. But as a web site owner, it's extremely useful for me to associate copy-on-read with an image because it significantly reduces my bandwidth. I have a hard time believing this isn't a valuable use-case and not one that's actually pretty common. As a web site user, I don't necessarily want you to control the behaviour of my qemu. :-) That's why I understand your argument about -blockdev and making sure all compat features can be overridden. I'm happy with that as a requirement. Okay, the only place I'm disagreeing slightly is that I think an image format should be able to request copy_on_read such that the default behavior if an explicit flag isn't specified is to do what the image suggests we do. Maybe we can agree on that. I'm not completely decided yet if allowing the image to contain such a hint is a good or a bad thing. It's a tough space. We don't want to include crazy amounts of metadata (and basically become OVF) but there's metadata that we would like to have. backing_format is a good example. It's a suggestion and it's something you really want to let a user override. Regards, Anthony Liguori Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] Why is avahi-daemon being started?
On Tue, Sep 07, 2010 at 09:38:15AM -0600, Eric Blake wrote: [adding libvir-list] On 09/07/2010 07:42 AM, Tom Horsley wrote: On Tue, 07 Sep 2010 13:51:09 +0100 Adam Williamson wrote: I did find a Should-start: avahi-daemon comment in the libvirtd init script, so maybe that is the source. Shouldn't be. 'Should-start' means 'if this other service is enabled, it should be started before this one': it's not a strict dependency, 'this other service MUST be started before this one'. Yea, changing the comment there didn't fix anything, but it was a good hint since I finally found the mdns_adv setting in the libvirtd.conf file and uncommenting it did finally squash avahi-daemon :-). I'm wondering if this means that libvirt should change any of its policies about auto-starting avahi-daemon, or at the very least, if there is a documentation shortcoming on why libvirt defaults to enabling this and when you might want to change that default. libvirtd has never explicitly auto-started avahi. libvirtd uses the avahi client library and gives it a callback to be invoked whenever a connection to the avahi daemon is established. With the old init system, if avahi wasn't started on boot, the callback isn't invoked and so libvirt never registers its mdns service. The sysadmin can start avahi at any time later, and libvirt will automatically register with it. With system-d it sounds like creating the avahi client will always immediately activate the avahi service. I think there perhaps needs to be a way to prevent autostart by the avahi client library Regards, Daniel -- |: Red Hat, Engineering, London-o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org-o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
Am 07.09.2010 17:11, schrieb Anthony Liguori: On 09/07/2010 10:02 AM, Kevin Wolf wrote: Am 07.09.2010 16:49, schrieb Anthony Liguori: Shouldn't it be a runtime option? You can use the very same image with copy-on-read or copy-on-write and it will behave the same (execpt for performance), so it's not an inherent feature of the image file. The way it's implemented in QED is that it's a compatible feature. This means that implementations are allowed to ignore it if they want to. It's really a suggestion. Well, the point is that I see no reason why an image should contain this suggestion. There's really nothing about an image that could reasonably indicate use this better with copy-on-read than with copy-on-write. It's a decision you make when using the image. Copy-on-read is, in many cases, a property of the backing file because it suggests that the backing file is either very slow or potentially volatile. The simple copy-on-read without actively streaming the rest of the image is not enough anyway for volatile backing files. IOW, let's say I'm an image distributor and I want to provide my images in a QED format that actually streams the image from an http server. I could provide a QED file without a copy-on-read bit set but I'd really like to convey this information as part of the image. You can argue that I should provide a config file too that contained the copy-on-read flag set but you could make the same argument about backing files too. No. The image is perfectly readable when using COW instead of COR. On the other hand, it's completely meaningless without its backing file. So yes, you could have a run time switch that overrides the feature bit on disk and either forces copy-on-read on or off. Do we have a way to pass block drivers run time options? We'll get them with -blockdev. Today we're using colons for format specific and separate -drive options for generic things. That's right. I think I'd rather wait for -blockdev. 
Well, then I consider -blockdev a dependency of QED (the copy-on-read part at least) and we can't merge it before we have -blockdev. You need to understand the cluster boundaries in order to optimize the metadata updates. Sure, you can expose interfaces to the block layer to give all of this info but that's solving the same problem for doing block level copy-on-write. The other challenge is that for copy-on-read to be efficient, you really need a format that can distinguish between unallocated sectors and zero sectors and do zero detection during the copy-on-read operation. Otherwise, if you have a 10G virtual disk with a backing file that's 1GB in size, copy-on-read will result in the leaf being 10G instead of ~1GB. That's a good point. But it's not a reason to make the interface specific to QED just because other formats would probably not implement it as efficiently. You really can't do as good of a job in the block layer because you have very little info about the characteristics of the disk image. I'm not saying that the generic block layer should implement copy-on-read. I just think that it should pass a run-time option to the driver - maybe just a BDRV_O_COPY_ON_READ flag - instead of having the information in the image file. From a user perspective it should look the same for qed, qcow2 and whatever else (like copy-on-write today) Kevin -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [Qemu-devel] QEMU interfaces for image streaming and post-copy block migration
On 09/07/2010 10:09 AM, Stefan Hajnoczi wrote: Right, so that argues for an incremental interface like I started with :-) BTW, this whole discussion is also relevant for other background tasks like online defragmentation so keep that use-case in mind too. Right, I'm a little hesitant to get too far into discussing the management interface because I remember long threads about polling and async. I never fully read them but I bet some wisdom came out of them that applies here. There are two ways to do a long running (async?) task: 1. Multiple smaller pokes. Perhaps completion of a single poke is async. But the key is that the interface is incremental and driven by the management stack. 2. State. Turn on streaming and watch it go. You can find out its current state using another command which will tell you whether it is enabled/disabled and progress. Use a command to disable it. If everyone is going to do (1) by just doing a tight loop or just using the same simple mechanism (a sleep(5)), then I agree, we should do (2). I can envision people wanting to do very complex decisions about the right time to do the next poke though and I'm looking for feedback about what other people think. I expected people to do complex heuristics with respect to migration convergence but in reality, I don't think anyone does today. So while I generally like being flexible, I realize that too much flexibility isn't always a good thing :-) Regards, Anthony Liguori Stefan -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] mingw: match recent changes in spec file
On 09/06/2010 04:34 AM, Daniel Veillard wrote: On Fri, Sep 03, 2010 at 02:12:47PM -0600, Eric Blake wrote: These changes allow './autobuild.sh' to complete again, when a full mingw cross-compilation is available on Fedora. * libvirt.spec.in (%file): List new installed files. * configure.ac (with_init_script): Assume default of none when cross-compiling. Looks fine to me, ACK Thanks; pushed. -- Eric Blake ebl...@redhat.com+1-801-349-2682 Libvirt virtualization library http://libvirt.org -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] esx: Fall back to path as key when QueryVirtualDiskUuid isn't available
2010/9/6 Daniel Veillard veill...@redhat.com: On Sat, Sep 04, 2010 at 01:36:08AM +0200, Matthias Bolte wrote: QueryVirtualDiskUuid is only available on an ESX(i) server. vCenter returns an NotImplemented fault and a GSX server is missing the VirtualDiskManager completely. Therefore only use QueryVirtualDiskUuid with an ESX(i) server and fall back to path as storage volume key for vCenter and GSX server. ACK, doing the dynamic allocation of the UUId is a good thing too Daniel Thanks, pushed. Matthias -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] esx: Use SessionIsActive when available
2010/9/6 Daniel Veillard veill...@redhat.com: On Sun, Sep 05, 2010 at 11:41:39PM +0200, Matthias Bolte wrote: --- src/esx/esx_vi.c | 122 +++--- src/esx/esx_vi.h | 1 + 2 files changed, 62 insertions(+), 61 deletions(-) Looks like information is available at runtime, so that sounds the right way. But a bit of details in the commit log would be nice the old code was just statically compiling it off right ? ACK Daniel True, this deserves a more verbose commit message. I added this and pushed the result: Before this commit SessionIsActive was not used because ESX(i) doesn't implement it. vCenter supports SessionIsActive, so use it here, but keep the fall back mechanism for ESX(i) and GSX Matthias -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH] Add nwfilter support to UML driver
On 09/07/2010 02:50 AM, Soren Hansen wrote: On 07-09-2010 10:32, Daniel Veillard wrote: We are supposed to be in feature freeze mode this week, Apologies. I didn't realise. Where could I have learned this? About once a month, the list gets a message announcing an upcoming release; here's the most recent one: https://www.redhat.com/archives/libvir-list/2010-September/msg3.html -- Eric Blake ebl...@redhat.com+1-801-349-2682 Libvirt virtualization library http://libvirt.org -- libvir-list mailing list libvir-list@redhat.com https://www.redhat.com/mailman/listinfo/libvir-list