[Qemu-devel] [RFC PATCH] tests: rtl8139: test timers and interrupt
Test behaviour of timers and interrupts related to timeouts. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- tests/Makefile | 2 +- tests/rtl8139-test.c | 164 +++ 2 files changed, 165 insertions(+), 1 deletion(-) This patch was derived from a test I did while implementing timer in rtl8139 code. Now that there is support for integrated testing I converted it. The test was tested on a real NIC. As if it's the first test I wrote I don't know if syntax and details are fine. For instance should I remove nop test? Should I split my test? diff --git a/tests/Makefile b/tests/Makefile index e4ddb6a..8858407 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -320,7 +320,7 @@ tests/tmp105-test$(EXESUF): tests/tmp105-test.o $(libqos-omap-obj-y) tests/i440fx-test$(EXESUF): tests/i440fx-test.o $(libqos-pc-obj-y) tests/fw_cfg-test$(EXESUF): tests/fw_cfg-test.o $(libqos-pc-obj-y) tests/e1000-test$(EXESUF): tests/e1000-test.o -tests/rtl8139-test$(EXESUF): tests/rtl8139-test.o +tests/rtl8139-test$(EXESUF): tests/rtl8139-test.o $(libqos-pc-obj-y) tests/pcnet-test$(EXESUF): tests/pcnet-test.o tests/eepro100-test$(EXESUF): tests/eepro100-test.o tests/vmxnet3-test$(EXESUF): tests/vmxnet3-test.o diff --git a/tests/rtl8139-test.c b/tests/rtl8139-test.c index f6a1be3..87f1792 100644 --- a/tests/rtl8139-test.c +++ b/tests/rtl8139-test.c @@ -10,19 +10,183 @@ #include glib.h #include string.h #include libqtest.h +#include libqos/pci-pc.h #include qemu/osdep.h +#include qemu-common.h /* Tests only initialization so far. 
TODO: Replace with functional tests */ static void nop(void) { } +#define CLK 3300 +#define NS_PER_SEC 10ULL + +static QPCIBus *pcibus; +static QPCIDevice *dev = NULL; +static void *dev_base = NULL; + +static void save_fn(QPCIDevice *dev, int devfn, void *data) +{ +QPCIDevice **pdev = (QPCIDevice **) data; + +*pdev = dev; +} + +static QPCIDevice *get_device(void) +{ +QPCIDevice *dev; + +pcibus = qpci_init_pc(); +qpci_device_foreach(pcibus, 0x10ec, 0x8139, save_fn, dev); +g_assert(dev != NULL); + +return dev; +} + +#define PORT(name,len,val) \ +static unsigned __attribute__((unused)) in_##name(void) { \ +unsigned res = qpci_io_read##len(dev,dev_base+(val)); \ +g_test_message(*%s - %x\n, #name, res); \ +return res; \ +} \ +static void out_##name(unsigned v) { \ +g_test_message(%x - *%s\n, v, #name); \ +qpci_io_write##len(dev,dev_base+(val),v); \ +} + +PORT(Timer,l,0x48) +PORT(IntrMask,w,0x3c) +PORT(IntrStatus,w,0x3E) +PORT(TimerInt,l,0x54) + +#define fatal(...) do { g_test_message(__VA_ARGS__); g_assert(0); } while(0) + +static void test_timer(void) +{ +const unsigned from = 0.95 * CLK; +const unsigned to = 1.6 * CLK; + +out_IntrMask(0); + +in_IntrStatus(); +in_Timer(); +in_Timer(); + +// Test 1. test counter continue and continue +out_TimerInt(0); // disable timer +out_IntrStatus(0x4000); +out_Timer(12345); // reset timer to 0 +unsigned curr = in_Timer(); +unsigned cnt; +if (curr 0.1 * CLK) +fatal(time too big %u\n, curr); +for (cnt=0;;) { +clock_step(1 * NS_PER_SEC); +unsigned prev = curr; +curr = in_Timer(); + +// test skip is in a specific range +unsigned diff = (curr-prev) 0xu; +if (diff from || diff to) +fatal(Invalid diff %u (%u-%u)\n, diff, from,to); +if (curr prev ++cnt == 3) +break; +} + +// Test 2. Check we didn't get an interrupt with TimerInt == 0 +if (in_IntrStatus() 0x4000) +fatal(got an interrupt\n); + +// Test 3. 
Setting TimerInt to 1 and Timer to 0 get interrupt +out_TimerInt(1); +out_Timer(0); +clock_step(40); +if ((in_IntrStatus() 0x4000) == 0) +fatal(we should have an interrupt here!\n); + +// Test 3. Check acknowledge +out_IntrStatus(0x4000); +if (in_IntrStatus() 0x4000) +fatal(got an interrupt\n); + +// Test. Status set after Timer reset +out_Timer(0); +out_TimerInt(0); +out_IntrStatus(0x4000); +curr = in_Timer(); +out_TimerInt(curr + 0.5 * CLK); +clock_step(1 * NS_PER_SEC); +out_Timer(0); +if ((in_IntrStatus() 0x4000) == 0) +fatal(we should have an interrupt here!\n); + +// Test. Status set after TimerInt reset +out_Timer(0); +out_TimerInt(0); +out_IntrStatus(0x4000); +curr = in_Timer(); +out_TimerInt(curr + 0.5 * CLK); +clock_step(1 * NS_PER_SEC); +out_TimerInt(0); +if ((in_IntrStatus() 0x4000) == 0) +fatal(we should have an interrupt here!\n); + +// Test 4. Increment TimerInt we should see an interrupt +curr = in_Timer(); +unsigned next = curr + 5.0 * CLK; +out_TimerInt(next); +for (cnt=0;;) { +clock_step(1 * NS_PER_SEC); +unsigned prev = curr; +curr = in_Timer(); +unsigned diff
Re: [Qemu-devel] [Xen-devel] Project idea: make QEMU more flexible
On Mon, 2014-01-06 at 12:54 +, Wei Liu wrote: Hi all This idea is to modify QEMU's Makefiles, plus implementing some stubs to make it possible to tailor QEMU to a smaller binary. The current setup for Xen on X86 is to build i386-softmmu, and uses this single binary for two purposes: 1. serves as device emulator for HVM guest. 2. serves as PV driver backend for PV guest. Perhaps even KVM would benefit too. Are you sure we don't emulate any instruction for other purposes? Either case CPU emulation is never used because Xen handles that already. So we are in fact having a load of unused code in QEMU build. What I have in mind is to build a QEMU binary which: 1. does not include CPU emulation code at all. 2. only includes components that's useful (what's useful is TBD). And the rationales behind this idea are: 1. Reduce memory footprint. One usecase would be running Xen on embedded platform (X86 or ARM). We would expect the system has very limited resources. The smaller the binary, the better. 2. It doesn't make sense to have i386 emulation on ARM platform. Arguably nobody can prevent user from running i386 emulator on ARM platform, but it doesn't make sense in Xen's setup where QEMU is only used as PV device backend on ARM. 3. Security concern. It's much easier to audit small code base. Please note that I'm not proposing to invalidate all the other usecases. I'm only speaking with my Xen developer's hat on, aiming to make QEMU more flexible. Down to implementation level I only need to (hopefully) add a few stubs and create some new CONFIG_* options and move a few things around. It might not be as intrusive as one thinks. In fact I've already hacked a prototype during Christmas. What's I've done so far: 1. create target-null which only has some stubs to CPU emulation framework. 2. add a few lines to configure / Makefiles*, create default-configs/null-softmmu Finally I got a qemu-system-null. 
And the effect is immediately visible -- the size of the QEMU binary shrank from 13MB to 7.6MB. I haven't really looked at what device emulation code can be removed, so the size could be made even smaller. Well, I would prefer something like an i386-nocpu. The system QEMU emulates is still specific to a given architecture. You can't emulate ARM with the same executable. So we'd add default-configs/arm-nocpu.mak and default-config/x86_64-nocpu.mak (i386 doesn't make much sense anymore). What do you think about this idea? Thanks Wei. Frediano
Re: [Qemu-devel] [RFC PATCH] vga: Start supporting resolution not multiple of 16 correctly.
2013/7/23 Gerd Hoffmann kra...@redhat.com Hi, Tested-by: Fabio Fantoni fabio.fant...@m2r.biz I tested it for a long time with spice on xen (because qxl will be fully working only after adding SSE support on hvm domUs). It works, I think it is good to add this and the respective vgabios patch on upstream. case VBE_DISPI_INDEX_XRES: -if ((val = VBE_DISPI_MAX_XRES) ((val 7) == 0)) { +if ((val = VBE_DISPI_MAX_XRES) ((val 1) == 0)) { s-vbe_regs[s-vbe_index] = val; } break; It's not that simple. With 32bit depths common today it will work fine, but for lower depths (especially those lower than 8bit) this will give you broken scanline alignment. cheers, Gerd In the card I tested the scanline is keep aligned but for this reason is not directly computed by maxx * bits but is something bigger. Frediano
Re: [Qemu-devel] [RFC PATCH] vga: Start supporting resolution not multiple of 16 correctly.
Modern notebook support 1366x768 resolution. The resolution width is not multiple of 16 causing some problems. QEMU VGA emulation requires width resolution to be multiple of 8. VNC implementation requires width resolution to be multiple of 16. Signed-off-by: Frediano Ziglio frediano.zig...@citrix.com --- hw/display/vga.c |2 +- ui/vnc.c | 34 +++--- 2 files changed, 20 insertions(+), 16 deletions(-) Updates: - rebased - fixed style problems - fixed typos in comment Attached patch for last vgabios in order to get some new resolutions. Still had no time to test deeply. diff --git a/hw/display/vga.c b/hw/display/vga.c index 21a108d..0053b0f 100644 --- a/hw/display/vga.c +++ b/hw/display/vga.c @@ -648,7 +648,7 @@ void vbe_ioport_write_data(void *opaque, uint32_t addr, uint32_t val) } break; case VBE_DISPI_INDEX_XRES: -if ((val = VBE_DISPI_MAX_XRES) ((val 7) == 0)) { +if ((val = VBE_DISPI_MAX_XRES) ((val 1) == 0)) { s-vbe_regs[s-vbe_index] = val; } break; diff --git a/ui/vnc.c b/ui/vnc.c index dfc7459..2a2bb90 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -911,26 +911,30 @@ static int vnc_update_client(VncState *vs, int has_dirty) for (y = 0; y height; y++) { int x; int last_x = -1; -for (x = 0; x width / 16; x++) { -if (test_and_clear_bit(x, vs-dirty[y])) { +for (x = 0; x width; x += 16) { +if (test_and_clear_bit(x/16, vs-dirty[y])) { if (last_x == -1) { last_x = x; } } else { if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, -height); +int h = find_and_clear_dirty_height(vs, y, last_x/16, +x/16, height); -n += vnc_job_add_rect(job, last_x * 16, y, - (x - last_x) * 16, h); +n += vnc_job_add_rect(job, last_x, y, + (x - last_x), h); } last_x = -1; } } if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, height); -n += vnc_job_add_rect(job, last_x * 16, y, - (x - last_x) * 16, h); +int h = find_and_clear_dirty_height(vs, y, last_x/16, x/16, +height); +if (x width) { +x = width; +} +n += vnc_job_add_rect(job, last_x, y, + (x - 
last_x), h); } } @@ -1861,7 +1865,7 @@ static void framebuffer_update_request(VncState *vs, int incremental, int w, int h) { int i; -const size_t width = surface_width(vs-vd-ds) / 16; +const size_t width = (surface_width(vs-vd-ds)+15) / 16; const size_t height = surface_height(vs-vd-ds); if (y_position height) { @@ -2686,10 +2690,6 @@ static int vnc_refresh_server_surface(VncDisplay *vd) * Check and copy modified bits from guest to server surface. * Update server dirty map. */ -cmp_bytes = 64; -if (cmp_bytes vnc_server_fb_stride(vd)) { -cmp_bytes = vnc_server_fb_stride(vd); -} if (vd-guest.format != VNC_SERVER_FB_FORMAT) { int width = pixman_image_get_width(vd-server); tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width); @@ -2710,8 +2710,12 @@ static int vnc_refresh_server_surface(VncDisplay *vd) } server_ptr = server_row; -for (x = 0; x + 15 width; +cmp_bytes = 64; +for (x = 0; x width; x += 16, guest_ptr += cmp_bytes, server_ptr += cmp_bytes) { +if (width - x 16) { +cmp_bytes = 4 * (width - x); +} if (!test_and_clear_bit((x / 16), vd-guest.dirty[y])) continue; if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) -- 1.7.10.4 2013/5/10 Andreas Färber afaer...@suse.de Am 15.03.2013 19:14, schrieb Frediano Ziglio: Modern notebook support 136x768 resolution. The resolution width is 1366? not multiple of 16 causing some problems. a multiple? (me not a native English speaker) Qemu VGA emulation require width resolution to be multiple of 8. QEMU VNC implementation require width resolution to be multiple of 16. requires or implementations This patch remove these limits. Was tested with a Windows machine
[Qemu-devel] [RFC PATCH] vga: Start supporting resolution not multiple of 16 correctly.
Modern notebook support 136x768 resolution. The resolution width is not multiple of 16 causing some problems. Qemu VGA emulation require width resolution to be multiple of 8. VNC implementation require width resolution to be multiple of 16. This patch remove these limits. Was tested with a Windows machine with standard vga and 1366x768 as resolution. I had to update vgabios as version in qemu (pc-bios/vgabios-stdvga.bin) is quite old. I also had to add some patches on top of VGABIOS 0.7a to add some new resolutions. I have some doubt about this patch - are other UI (sdl, cocoa, qxl) happy if resolution is not multiple of 16 ? - scanline is computed exactly without any alignment (so 1366 8 bit is 1366 bytes) while getting vesa information from a laptop it seems to use some kind of alignment (if became 0x580 which is 1408 bytes). Perhaps should I change either VGABIOS and Qemu to make this alignment? Signed-off-by: Frediano Ziglio frediano.zig...@citrix.com --- hw/vga.c |2 +- ui/vnc.c | 27 +-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/hw/vga.c b/hw/vga.c index 1caf23d..d229f06 100644 --- a/hw/vga.c +++ b/hw/vga.c @@ -651,7 +651,7 @@ void vbe_ioport_write_data(void *opaque, uint32_t addr, uint32_t val) } break; case VBE_DISPI_INDEX_XRES: -if ((val = VBE_DISPI_MAX_XRES) ((val 7) == 0)) { +if ((val = VBE_DISPI_MAX_XRES) ((val 1) == 0)) { s-vbe_regs[s-vbe_index] = val; } break; diff --git a/ui/vnc.c b/ui/vnc.c index ff4e2ae..328d14d 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -907,26 +907,27 @@ static int vnc_update_client(VncState *vs, int has_dirty) for (y = 0; y height; y++) { int x; int last_x = -1; -for (x = 0; x width / 16; x++) { -if (test_and_clear_bit(x, vs-dirty[y])) { +for (x = 0; x width; x += 16) { +if (test_and_clear_bit(x/16, vs-dirty[y])) { if (last_x == -1) { last_x = x; } } else { if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, +int h = find_and_clear_dirty_height(vs, y, last_x/16, x/16, height); -n += 
vnc_job_add_rect(job, last_x * 16, y, - (x - last_x) * 16, h); +n += vnc_job_add_rect(job, last_x, y, + (x - last_x), h); } last_x = -1; } } if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, height); -n += vnc_job_add_rect(job, last_x * 16, y, - (x - last_x) * 16, h); +int h = find_and_clear_dirty_height(vs, y, last_x/16, x/16, height); +if (x width) x = width; +n += vnc_job_add_rect(job, last_x, y, + (x - last_x), h); } } @@ -1771,7 +1772,7 @@ static void framebuffer_update_request(VncState *vs, int incremental, int w, int h) { int i; -const size_t width = ds_get_width(vs-ds) / 16; +const size_t width = (ds_get_width(vs-ds)+15) / 16; if (y_position ds_get_height(vs-ds)) y_position = ds_get_height(vs-ds); @@ -2595,10 +2596,6 @@ static int vnc_refresh_server_surface(VncDisplay *vd) * Check and copy modified bits from guest to server surface. * Update server dirty map. */ -cmp_bytes = 64; -if (cmp_bytes vnc_server_fb_stride(vd)) { -cmp_bytes = vnc_server_fb_stride(vd); -} if (vd-guest.format != VNC_SERVER_FB_FORMAT) { int width = pixman_image_get_width(vd-server); tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width); @@ -2619,8 +2616,10 @@ static int vnc_refresh_server_surface(VncDisplay *vd) } server_ptr = server_row; -for (x = 0; x + 15 width; +cmp_bytes = 64; +for (x = 0; x width; x += 16, guest_ptr += cmp_bytes, server_ptr += cmp_bytes) { +if (width - x 16) cmp_bytes = 4 * (width - x); if (!test_and_clear_bit((x / 16), vd-guest.dirty[y])) continue; if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) -- 1.7.10.4
[Qemu-devel] Fix invalidate if memory requested was not bucket aligned
When memory is mapped in qemu_map_cache with lock != 0 a reverse mapping is created pointing to the virtual address of location requested. The cached mapped entry is saved in last_address_vaddr with the memory location of the base virtual address (without bucket offset). However when this entry is invalidated the virtual address saved in the reverse mapping is used. This cause that the mapping is freed but the last_address_vaddr is not reset. Signed-off-by: Frediano Ziglio frediano.zig...@citrix.com --- xen-mapcache.c |9 + 1 files changed, 5 insertions(+), 4 deletions(-) diff --git a/xen-mapcache.c b/xen-mapcache.c index 59ba085..9cd6db3 100644 --- a/xen-mapcache.c +++ b/xen-mapcache.c @@ -320,10 +320,6 @@ void xen_invalidate_map_cache_entry(uint8_t *buffer) target_phys_addr_t size; int found = 0; -if (mapcache-last_address_vaddr == buffer) { -mapcache-last_address_index = -1; -} - QTAILQ_FOREACH(reventry, mapcache-locked_entries, next) { if (reventry-vaddr_req == buffer) { paddr_index = reventry-paddr_index; @@ -342,6 +338,11 @@ void xen_invalidate_map_cache_entry(uint8_t *buffer) QTAILQ_REMOVE(mapcache-locked_entries, reventry, next); g_free(reventry); +if (mapcache-last_address_index == paddr_index) { +mapcache-last_address_index = -1; +mapcache-last_address_vaddr = NULL; +} + entry = mapcache-entry[paddr_index % mapcache-nr_buckets]; while (entry (entry-paddr_index != paddr_index || entry-size != size)) { pentry = entry; -- 1.7.5.4
Re: [Qemu-devel] Qemu crashed with lsi booting
2012/7/27 Paolo Bonzini pbonz...@redhat.com: Il 26/07/2012 10:31, Frediano Ziglio ha scritto: sudo ./x86_64-softmmu/qemu-system-x86_64 -m 1024 -hda test.qcow -device lsi -drive file=/dev/sdb,if=none,id=XXX -device scsi-block,drive=XXX -enable-kvm -bios ~/seabios/out/bios.bin -serial file:out.txt lsi_scsi: error: Multiple IO pending for request 0x7fd1075bf100 qemu-system-x86_64: /home/fredianoz/qemu/hw/lsi53c895a.c:774: lsi_do_command: Assertion `s-current == ((void *)0)' failed. (sometimes I don't get the Multiple IO ending line). I'm using a recent SeaBIOS which support booting from LSI SCSI. Qemu version $ git branch -v * master 61dc008 Revert audio: Make PC speaker audio card available by default I'm using SeaBIOS commit 9d6bac1d32b72cdf7c0ad009c1371a2e69084de3 (some minor changes in order to support 4k sectors). Can you share them? 4k sectors are not supported by BIOS at all AFAIK... Does virtio-scsi work with those changes? Adding some debugging to SeaBIOS lsi code seems that drivers send initial INQUIRY request but after that all requests have some problems and lead to a lot of reset command. Can you gather tracing output for the following events: scsi_req_data scsi_req_dequeue scsi_req_parsed scsi_req_parse_bad scsi_req_build_sense scsi_inquiry scsi_test_unit_ready (Perhaps you can also instrument scsi_req_cancel for tracing). The timing doesn't matter, so you can use the stderr backend: --enable-trace-backend=stderr Place the above list in a file (one tracepoint per line) and then start QEMU with -trace events=/path/to/file.txt. You could also try github.com/bonzini/qemu.git, branch scsi-next (it shouldn't fix anything, but I added a couple more assertions). Paolo It's currently a quite complicated setup. I used a virtual SCSI disk. However Linux is happy with it and does not crash. Also I got an assert even using scsi-hd directly (I had to disable multipathd). 
It seems that the LSI Qemu driver assert when code try to send two commands (second without waiting first completion). Attached error with trace enabled (err.txt) and serial output with SeaBIOS debug level set to 8. To support these 4kb disk in SeaBIOS I just changed the line in src/blockcmd.c if (drive-blksize != DISK_SECTOR_SIZE) { to if (drive-blksize != DISK_SECTOR_SIZE drive-blksize != 4096) { Frediano serial.txt.gz Description: GNU Zip compressed data err.txt.gz Description: GNU Zip compressed data
[Qemu-devel] Qemu crashed with lsi booting
I always get an assert launching this command sudo ./x86_64-softmmu/qemu-system-x86_64 -m 1024 -hda test.qcow -device lsi -drive file=/dev/sdb,if=none,id=XXX -device scsi-block,drive=XXX -enable-kvm -bios ~/seabios/out/bios.bin -serial file:out.txt lsi_scsi: error: Multiple IO pending for request 0x7fd1075bf100 qemu-system-x86_64: /home/fredianoz/qemu/hw/lsi53c895a.c:774: lsi_do_command: Assertion `s-current == ((void *)0)' failed. (sometimes I don't get the Multiple IO ending line). I'm using a recent SeaBIOS which support booting from LSI SCSI. Qemu version $ git branch -v * master 61dc008 Revert audio: Make PC speaker audio card available by default I'm using SeaBIOS commit 9d6bac1d32b72cdf7c0ad009c1371a2e69084de3 (some minor changes in order to support 4k sectors). Adding some debugging to SeaBIOS lsi code seems that drivers send initial INQUIRY request but after that all requests have some problems and lead to a lot of reset command. Frediano
[Qemu-devel] automatic exit on memory errors
Hi, I just realized that on any out-of-memory condition QEMU exits with an error. I found this thread on g_malloc http://comments.gmane.org/gmane.comp.emulators.qemu/128863 but nothing related to the exit conditions. I know that out-of-memory conditions are quite hard to handle, but if somebody relies on a virtual machine they expect at least a clean shutdown, and I think this exit behavior shouldn't happen. Personally I think this situation should be managed in a clean way, where the error is reported and cleanup is attempted. If some function tries to allocate a large buffer, it should just return an error so the program can report it and let the user clean up or try to recover (for example by powering off the machine); calling g_malloc everywhere is not that fine. It would at least be good if block drivers registered some atexit function to flush every buffer they may possibly hold. Has anybody considered this problem? Frediano
[Qemu-devel] [PATCH] make qemu_event_handle static
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- main-loop.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/main-loop.c b/main-loop.c index db23de0..fc738d1 100644 --- a/main-loop.c +++ b/main-loop.c @@ -164,7 +164,7 @@ static int qemu_signal_init(void) #else /* _WIN32 */ -HANDLE qemu_event_handle = NULL; +static HANDLE qemu_event_handle = NULL; static void dummy_event_handler(void *opaque) { -- 1.7.4.1
[Qemu-devel] Adding size to snapshot
Hi, did you add the disk size to the snapshot in order to support resizing of snapshots? I remember we discussed this some months ago. Regards Frediano
[Qemu-devel] [PATCH] qcow2: fix 028 iotest
This fix bound check bug accessing last cluster if image size is not cluster aligned caused by Unlock during COW patch. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-cluster.c |8 ++-- block/qcow2.c |2 +- block/qcow2.h |2 ++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index c9a1b1c..78e1b49 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -295,16 +295,20 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, BDRVQcowState *s = bs-opaque; int n, ret; void *buf; +QEMUIOVector qiov; +struct iovec iov; n = n_end - n_start; if (n = 0) { return 0; } -buf = qemu_blockalign(bs, n * BDRV_SECTOR_SIZE); +iov.iov_base = buf = qemu_blockalign(bs, n * BDRV_SECTOR_SIZE); +iov.iov_len = n * BDRV_SECTOR_SIZE; +qemu_iovec_init_external(qiov, iov, 1); BLKDBG_EVENT(bs-file, BLKDBG_COW_READ); -ret = bdrv_read(bs, start_sect + n_start, buf, n); +ret = qcow2_co_readv(bs, start_sect + n_start, n, qiov); if (ret 0) { goto out; } diff --git a/block/qcow2.c b/block/qcow2.c index 510ff68..fd90881 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -376,7 +376,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, return n1; } -static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, +int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; diff --git a/block/qcow2.h b/block/qcow2.h index 531af39..7f5ff42 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -176,6 +176,8 @@ static inline int64_t align_offset(int64_t offset, int n) /* qcow2.c functions */ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors); +int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, +int remaining_sectors, QEMUIOVector *qiov); /* qcow2-refcount.c functions */ int qcow2_refcount_init(BlockDriverState *bs); -- 1.7.1
[Qemu-devel] [PATCH] core: remove qemu_service_io
qemu_service_io was mainly an alias to qemu_notify_event, currently used only by PPC for timer hack, so call qemu_notify_event directly. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- arch_init.c |5 - qemu-common.h|3 --- qemu-tool.c |4 target-ppc/kvm_ppc.c |2 +- 4 files changed, 1 insertions(+), 13 deletions(-) diff --git a/arch_init.c b/arch_init.c index 9a5a0e3..a6c69c7 100644 --- a/arch_init.c +++ b/arch_init.c @@ -459,11 +459,6 @@ int ram_load(QEMUFile *f, void *opaque, int version_id) return 0; } -void qemu_service_io(void) -{ -qemu_notify_event(); -} - #ifdef HAS_AUDIO struct soundhw { const char *name; diff --git a/qemu-common.h b/qemu-common.h index 404c421..5e87bdf 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -276,9 +276,6 @@ void cpu_exec_init_all(void); void cpu_save(QEMUFile *f, void *opaque); int cpu_load(QEMUFile *f, void *opaque, int version_id); -/* Force QEMU to stop what it's doing and service IO */ -void qemu_service_io(void); - /* Force QEMU to process pending events */ void qemu_notify_event(void); diff --git a/qemu-tool.c b/qemu-tool.c index eb89fe0..e9f7fe1 100644 --- a/qemu-tool.c +++ b/qemu-tool.c @@ -29,10 +29,6 @@ struct QEMUBH void *opaque; }; -void qemu_service_io(void) -{ -} - Monitor *cur_mon; int monitor_cur_is_qmp(void) diff --git a/target-ppc/kvm_ppc.c b/target-ppc/kvm_ppc.c index 867dc1d..c031fcb 100644 --- a/target-ppc/kvm_ppc.c +++ b/target-ppc/kvm_ppc.c @@ -88,7 +88,7 @@ void kvmppc_fdt_update(void *fdt) static void kvmppc_timer_hack(void *opaque) { -qemu_service_io(); +qemu_notify_event(); qemu_mod_timer(kvmppc_timer, qemu_get_clock_ns(vm_clock) + kvmppc_timer_rate); } -- 1.7.1
Re: [Qemu-devel] [PATCH] qcow2: Unlock during COW
2011/9/19 Kevin Wolf kw...@redhat.com: Unlocking during COW allows for more parallelism. One change it requires is that buffers are dynamically allocated instead of just using a per-image buffer. While touching the code, drop the synchronous qcow2_read() function and replace it by a bdrv_read() call. Signed-off-by: Kevin Wolf kw...@redhat.com --- block/qcow2-cluster.c | 95 +--- 1 files changed, 26 insertions(+), 69 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index c79323a..61b8e62 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -289,89 +289,42 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, } } - -static int qcow2_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVQcowState *s = bs-opaque; - int ret, index_in_cluster, n, n1; - uint64_t cluster_offset; - struct iovec iov; - QEMUIOVector qiov; - - while (nb_sectors 0) { - n = nb_sectors; - - ret = qcow2_get_cluster_offset(bs, sector_num 9, n, - cluster_offset); - if (ret 0) { - return ret; - } - - index_in_cluster = sector_num (s-cluster_sectors - 1); - if (!cluster_offset) { - if (bs-backing_hd) { - /* read from the base image */ - iov.iov_base = buf; - iov.iov_len = n * 512; - qemu_iovec_init_external(qiov, iov, 1); - - n1 = qcow2_backing_read1(bs-backing_hd, qiov, sector_num, n); - if (n1 0) { - BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING); - ret = bdrv_read(bs-backing_hd, sector_num, buf, n1); - if (ret 0) - return -1; - } - } else { - memset(buf, 0, 512 * n); - } - } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { - if (qcow2_decompress_cluster(bs, cluster_offset) 0) - return -1; - memcpy(buf, s-cluster_cache + index_in_cluster * 512, 512 * n); - } else { - BLKDBG_EVENT(bs-file, BLKDBG_READ); - ret = bdrv_pread(bs-file, cluster_offset + index_in_cluster * 512, buf, n * 512); - if (ret != n * 512) - return -1; - if (s-crypt_method) { - qcow2_encrypt_sectors(s, sector_num, buf, buf, n, 0, - s-aes_decrypt_key); - } 
- } - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - return 0; -} - static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, uint64_t cluster_offset, int n_start, int n_end) { BDRVQcowState *s = bs-opaque; int n, ret; + void *buf; n = n_end - n_start; - if (n = 0) + if (n = 0) { return 0; + } + + buf = qemu_blockalign(bs, n * BDRV_SECTOR_SIZE); + BLKDBG_EVENT(bs-file, BLKDBG_COW_READ); - ret = qcow2_read(bs, start_sect + n_start, s-cluster_data, n); - if (ret 0) - return ret; + ret = bdrv_read(bs, start_sect + n_start, buf, n); + if (ret 0) { + goto out; + } + if (s-crypt_method) { qcow2_encrypt_sectors(s, start_sect + n_start, - s-cluster_data, - s-cluster_data, n, 1, + buf, buf, n, 1, s-aes_encrypt_key); } + BLKDBG_EVENT(bs-file, BLKDBG_COW_WRITE); - ret = bdrv_write(bs-file, (cluster_offset 9) + n_start, - s-cluster_data, n); - if (ret 0) - return ret; - return 0; + ret = bdrv_write(bs-file, (cluster_offset 9) + n_start, buf, n); + if (ret 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(buf); + return ret; } @@ -620,7 +573,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) start_sect = (m-offset ~(s-cluster_size - 1)) 9; if (m-n_start) { cow = true; + qemu_co_mutex_unlock(s-lock); ret = copy_sectors(bs, start_sect, cluster_offset, 0, m-n_start); + qemu_co_mutex_lock(s-lock); if (ret 0) goto err; } @@ -628,8 +583,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) if (m-nb_available (s-cluster_sectors - 1)) { uint64_t end = m-nb_available ~(uint64_t)(s-cluster_sectors - 1); cow = true; + qemu_co_mutex_unlock(s-lock); ret = copy_sectors(bs, start_sect + end, cluster_offset + (end 9), m-nb_available - end, s-cluster_sectors); + qemu_co_mutex_lock(s-lock); if (ret 0) goto err; } -- 1.7.6.2 Seems good, I like the idea
[Qemu-devel] [PATCH 0/2][RFC?] Remove SIGUSR2 from posix-aio
Now that iothread is always compiled, sending a signal seems only an additional step. This patch also avoids writing to two pipes (one from the signal handler and one in qemu_service_io). Tested and works correctly with KVM enabled. Performance is only slightly better (as I expected). strace output is more readable. Doubts: - does it make sense to have two patches and not only the last one? - the comment is perhaps wrong as it also affects the main core - is it OK if KVM is disabled? more testing required Frediano Ziglio (2): block: avoid storing a constant in qemu_paiocb structure block: avoid SIGUSR2 cpus.c |5 - posix-aio-compat.c | 17 - 2 files changed, 4 insertions(+), 18 deletions(-)
[Qemu-devel] [PATCH 1/2] block: avoid storing a constant in qemu_paiocb structure
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- posix-aio-compat.c |5 + 1 files changed, 1 insertions(+), 4 deletions(-) diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 3193dbf..7ea63a1 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -42,7 +42,6 @@ struct qemu_paiocb { int aio_niov; size_t aio_nbytes; #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ -int ev_signo; off_t aio_offset; QTAILQ_ENTRY(qemu_paiocb) node; @@ -381,7 +380,7 @@ static void *aio_thread(void *unused) aiocb-ret = ret; mutex_unlock(lock); -if (kill(pid, aiocb-ev_signo)) die(kill failed); +if (kill(pid, SIGUSR2)) die(kill failed); } cur_threads--; @@ -623,7 +622,6 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, return NULL; acb-aio_type = type; acb-aio_fildes = fd; -acb-ev_signo = SIGUSR2; if (qiov) { acb-aio_iov = qiov-iov; @@ -651,7 +649,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, return NULL; acb-aio_type = QEMU_AIO_IOCTL; acb-aio_fildes = fd; -acb-ev_signo = SIGUSR2; acb-aio_offset = 0; acb-aio_ioctl_buf = buf; acb-aio_ioctl_cmd = req; -- 1.7.1
[Qemu-devel] [PATCH 2/2] block: avoid SIGUSR2
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- cpus.c |5 - posix-aio-compat.c | 14 -- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/cpus.c b/cpus.c index 54c188c..d0cfe91 100644 --- a/cpus.c +++ b/cpus.c @@ -380,11 +380,6 @@ static int qemu_signal_init(void) int sigfd; sigset_t set; -/* SIGUSR2 used by posix-aio-compat.c */ -sigemptyset(set); -sigaddset(set, SIGUSR2); -pthread_sigmask(SIG_UNBLOCK, set, NULL); - /* * SIG_IPI must be blocked in the main thread and must not be caught * by sigwait() in the signal thread. Otherwise, the cpu thread will diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 7ea63a1..72c6dbb 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -308,6 +308,8 @@ static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb) return nbytes; } +static void posix_aio_notify_event(void); + static void *aio_thread(void *unused) { pid_t pid; @@ -380,7 +382,7 @@ static void *aio_thread(void *unused) aiocb-ret = ret; mutex_unlock(lock); -if (kill(pid, SIGUSR2)) die(kill failed); +posix_aio_notify_event(); } cur_threads--; @@ -547,7 +549,7 @@ static int posix_aio_flush(void *opaque) static PosixAioState *posix_aio_state; -static void aio_signal_handler(int signum) +static void posix_aio_notify_event(void) { if (posix_aio_state) { char byte = 0; @@ -557,8 +559,6 @@ static void aio_signal_handler(int signum) if (ret 0 errno != EAGAIN) die(write()); } - -qemu_service_io(); } static void paio_remove(struct qemu_paiocb *acb) @@ -662,7 +662,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, int paio_init(void) { -struct sigaction act; PosixAioState *s; int fds[2]; int ret; @@ -672,11 +671,6 @@ int paio_init(void) s = g_malloc(sizeof(PosixAioState)); -sigfillset(act.sa_mask); -act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ -act.sa_handler = aio_signal_handler; -sigaction(SIGUSR2, act, NULL); - s-first_aio = NULL; if (qemu_pipe(fds) == -1) { fprintf(stderr, failed to create pipe\n); -- 1.7.1
[Qemu-devel] [PATCH v2] block: avoid SIGUSR2
Now that iothread is always compiled sending a signal seems only an additional step. This patch also avoid writing to two pipe (one from signal and one in qemu_service_io). Work with kvm enabled or disabled. strace output is more readable (less syscalls). Signed-off-by: Frediano Ziglio fredd...@gmail.com --- cpus.c |5 - posix-aio-compat.c | 29 + 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/cpus.c b/cpus.c index 54c188c..d0cfe91 100644 --- a/cpus.c +++ b/cpus.c @@ -380,11 +380,6 @@ static int qemu_signal_init(void) int sigfd; sigset_t set; -/* SIGUSR2 used by posix-aio-compat.c */ -sigemptyset(set); -sigaddset(set, SIGUSR2); -pthread_sigmask(SIG_UNBLOCK, set, NULL); - /* * SIG_IPI must be blocked in the main thread and must not be caught * by sigwait() in the signal thread. Otherwise, the cpu thread will diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 3193dbf..185d5b2 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -42,7 +42,6 @@ struct qemu_paiocb { int aio_niov; size_t aio_nbytes; #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ -int ev_signo; off_t aio_offset; QTAILQ_ENTRY(qemu_paiocb) node; @@ -309,6 +308,8 @@ static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb) return nbytes; } +static void posix_aio_notify_event(void); + static void *aio_thread(void *unused) { pid_t pid; @@ -381,7 +382,7 @@ static void *aio_thread(void *unused) aiocb-ret = ret; mutex_unlock(lock); -if (kill(pid, aiocb-ev_signo)) die(kill failed); +posix_aio_notify_event(); } cur_threads--; @@ -548,18 +549,14 @@ static int posix_aio_flush(void *opaque) static PosixAioState *posix_aio_state; -static void aio_signal_handler(int signum) +static void posix_aio_notify_event(void) { -if (posix_aio_state) { -char byte = 0; -ssize_t ret; - -ret = write(posix_aio_state-wfd, byte, sizeof(byte)); -if (ret 0 errno != EAGAIN) -die(write()); -} +char byte = 0; +ssize_t ret; -qemu_service_io(); +ret = write(posix_aio_state-wfd, byte, sizeof(byte)); +if 
(ret 0 errno != EAGAIN) +die(write()); } static void paio_remove(struct qemu_paiocb *acb) @@ -623,7 +620,6 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, return NULL; acb-aio_type = type; acb-aio_fildes = fd; -acb-ev_signo = SIGUSR2; if (qiov) { acb-aio_iov = qiov-iov; @@ -651,7 +647,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, return NULL; acb-aio_type = QEMU_AIO_IOCTL; acb-aio_fildes = fd; -acb-ev_signo = SIGUSR2; acb-aio_offset = 0; acb-aio_ioctl_buf = buf; acb-aio_ioctl_cmd = req; @@ -665,7 +660,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, int paio_init(void) { -struct sigaction act; PosixAioState *s; int fds[2]; int ret; @@ -675,11 +669,6 @@ int paio_init(void) s = g_malloc(sizeof(PosixAioState)); -sigfillset(act.sa_mask); -act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ -act.sa_handler = aio_signal_handler; -sigaction(SIGUSR2, act, NULL); - s-first_aio = NULL; if (qemu_pipe(fds) == -1) { fprintf(stderr, failed to create pipe\n); -- 1.7.1
[Qemu-devel] [PATCH] removed unused offset variable
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- posix-aio-compat.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 3193dbf..63a8fae 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -181,7 +181,6 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb) { -size_t offset = 0; ssize_t len; do { @@ -189,12 +188,12 @@ static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb) len = qemu_pwritev(aiocb-aio_fildes, aiocb-aio_iov, aiocb-aio_niov, - aiocb-aio_offset + offset); + aiocb-aio_offset); else len = qemu_preadv(aiocb-aio_fildes, aiocb-aio_iov, aiocb-aio_niov, - aiocb-aio_offset + offset); + aiocb-aio_offset); } while (len == -1 errno == EINTR); if (len == -1) -- 1.7.1
Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
2011/9/14 Kevin Wolf kw...@redhat.com: ... But let's measure the effects first, I suspect that for cluster allocation it doesn't help much because every REF- comes with a REF+. That's 50% of effort if REF- clusters are far from REF+ :) I would expect that the next REF+ allocates exactly the REF- cluster. But you still have a point, we save the write on REF- and combine it with the REF+ write. This is still a TODO for the REF+ patch. Actually, I was talking about the qcow2_cache_entry_mark_dirty_wb() case without any other change. You get it automatically then. No, I forgot I had already thought about this. REF- clusters do not get reused during normal operation; they reach 0 only during snapshot delete. This means that if a counter reaches 0 it must have been 1, but if it was 1 the copied flag would have been 1 as well, and so there would have been no reason to decrement the counter (reductio ad absurdum). Frediano
Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
2011/9/14 Kevin Wolf kw...@redhat.com: Am 13.09.2011 15:36, schrieb Frediano Ziglio: 2011/9/13 Kevin Wolf kw...@redhat.com: Am 13.09.2011 09:53, schrieb Frediano Ziglio: These patches try to trade-off between leaks and speed for clusters refcounts. Refcount increments (REF+ or refp) are handled in a different way from decrements (REF- or refm). The reason it that posting or not flushing a REF- cause just a leak while posting a REF+ cause a corruption. To optimize REF- I just used an array to store offsets then when a flush is requested or array reach a limit (currently 1022) the array is sorted and written to disk. I use an array with offset instead of ranges to support compression (an offset could appear multiple times in the array). I consider this patch quite ready. Ok, first of all let's clarify what this optimises. I don't think it changes anything at all for the writeback cache modes, because these already do most operations in memory only. So this must be about optimising some operations with cache=writethrough. REF- isn't about normal cluster allocation, it is about COW with internal snapshots or bdrv_discard. Do you have benchmarks for any of them? I strongly disagree with your approach for REF-. We already have a cache, and introducing a second one sounds like a bad idea. I think we could get a very similar effect if we introduced a qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as dirty, but at the same time tells the cache that even in write-through mode it can still treat this block as write-back. This should require much less code changes. Yes, mainly optimize for writethrough. I did not test with writeback but should improve even this (I think here you have some flush to keep consistency). I'll try to write a qcow2_cache_entry_mark_dirty_wb patch and test it. Great, thanks! Don't expect however the patch too soon, I'm quite busy in these days. 
But let's measure the effects first, I suspect that for cluster allocation it doesn't help much because every REF- comes with a REF+. That's 50% of effort if REF- clusters are far from REF+ :) I would expect that the next REF+ allocates exactly the REF- cluster. But you still have a point, we save the write on REF- and combine it with the REF+ write. This is still a TODO for REF+ patch. Oh... time ago looking at refcount code I realize that a single deallocation could be reused in some cases only after Qemu restart. For instance - got a single cluster REF- which take refcount to 0 - free_cluster_index get decreased to this index - we get a new cluster request for 2 clusters - free_cluster_index get increased we skip freed deallocation and if we don't get a new deallocation for a cluster with index minor to our freed cluster this cluster is not reused. (I didn't test this behavior, no leak, no corruption, just image could be larger then expected) To optimize REF+ I mark a range as allocated and use this range to get new ones (avoiding writing refcount to disk). When a flush is requested or in some situations (like snapshot) this cache is disabled and flushed (written as REF-). I do not consider this patch ready, it works and pass all io-tests but for instance I would avoid allocating new clusters for refcount during preallocation. The only question here is if improving cache=writethrough cluster allocation performance is worth the additional complexity in the already complex refcounting code. I didn't see this optimization as a second level cache, but yes, for REF- is a second cache. The alternative that was discussed before is the dirty bit approach that is used in QED and would allow us to use writeback for all refcount blocks, regardless of REF- or REF+. It would be an easier approach requiring less code changes, but it comes with the cost of requiring an fsck after a qemu crash. 
I was thinking about changing the header magic first time we change refcount in order to mark image as dirty so newer Qemu recognize the flag while former one does not recognize image. Obviously reverting magic on image close. We've discussed this idea before and I think it wasn't considered a great idea to automagically change the header in an incompatible way. But we can always say that for improved performance you need to upgrade your image to qcow2 v3. I don't understand why there is not a wiki page for detailed qcow3 changes. I saw your post on May. I follow this ML since August so I think I missed a lot of discussion on qcow improves. End speed up is quite visible allocating clusters (more then 20%). What benchmark do you use for testing this? Kevin Currently I'm using bonnie++ but I noted similar improves with iozone. The test script format an image then launch a Linux machine which run a script and save result to a file. The test image is seems by this virtual machine as a separate disk. The file on hist reside in a separate LV. I got quite
Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
2011/9/14 Kevin Wolf kw...@redhat.com: ... omissis... To optimize REF+ I mark a range as allocated and use this range to get new ones (avoiding writing refcount to disk). When a flush is requested or in some situations (like snapshot) this cache is disabled and flushed (written as REF-). I do not consider this patch ready, it works and pass all io-tests but for instance I would avoid allocating new clusters for refcount during preallocation. The only question here is if improving cache=writethrough cluster allocation performance is worth the additional complexity in the already complex refcounting code. I didn't see this optimization as a second level cache, but yes, for REF- is a second cache. The alternative that was discussed before is the dirty bit approach that is used in QED and would allow us to use writeback for all refcount blocks, regardless of REF- or REF+. It would be an easier approach requiring less code changes, but it comes with the cost of requiring an fsck after a qemu crash. I was thinking about changing the header magic first time we change refcount in order to mark image as dirty so newer Qemu recognize the flag while former one does not recognize image. Obviously reverting magic on image close. We've discussed this idea before and I think it wasn't considered a great idea to automagically change the header in an incompatible way. But we can always say that for improved performance you need to upgrade your image to qcow2 v3. I don't understand why there is not a wiki page for detailed qcow3 changes. I saw your post on May. I follow this ML since August so I think I missed a lot of discussion on qcow improves. Unfortunately there have been almost no comments, so you can consider RFC v2 as the current proposal. :( End speed up is quite visible allocating clusters (more then 20%). What benchmark do you use for testing this? Kevin Currently I'm using bonnie++ but I noted similar improves with iozone. 
The test script format an image then launch a Linux machine which run a script and save result to a file. The test image is seems by this virtual machine as a separate disk. The file on hist reside in a separate LV. I got quite consistent results (of course not working on the machine while testing, is not actually dedicated to this job). Actually I'm running the test (added a test working in a snapshot image). Okay. Let me guess the remaining variables: The image is on an ext4 host filesystem, you use cache=writethrough and virtio-blk. You don't use backing files, compression and encryption. For your tests with internal snapshots you have exactly one internal snapshot that is taken immediately before the benchmark. Oh, and not to forget, KVM is enabled. Are these assumptions correct? change ext4 and put xfs and assumptions are ok. Yes I use internal snapshots (REF- are useful only in this case). To produce qcow2s I use these commands $QEMU_IMG create -f qcow2 -o preallocation=metadata $FILE 15g $QEMU_IMG snapshot -c test $FILE Ah, using preallocation for this is a nice trick. :-) Anyway, thanks, I think I understand now what you're measuring. Yes, and real performance are even worst cause preallocating only metadata you get many less physical read from disk. I run again tests (yesterdays test was not that consistence, today I stopped all unneeded services and avoid the X session tunneled by ssh). without patched run raw qcow2 qcow2s 1 22758 4206 1054 2 28401 21745 17997 with patches (ref-,ref+) run raw qcow2 qcow2s 1 22563 4965 4382 2 23823 21043 20904 beside I still don't understand difference in second runs for raw (about 20%!!!) this confirms the huge improves with snapshot. Measuring with cache=writethrough means that you use the host page cache, so I think that could be where you get the difference for raw. Numbers should be more consistent for cache=none, but that's not what we want to test... 
Did you also test how each patch performs if applied on its own, without the other one? It would be interesting to see how much REF- optimisation contributes and how much REF+. Kevin Here you are only ref- run rawqcow2 qcow2s 1 22579 40724053 2 30322 19667 16830 so mostly REF- for snapshot. Frediano
[Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
These patches try to trade-off between leaks and speed for clusters refcounts. Refcount increments (REF+ or refp) are handled in a different way from decrements (REF- or refm). The reason it that posting or not flushing a REF- cause just a leak while posting a REF+ cause a corruption. To optimize REF- I just used an array to store offsets then when a flush is requested or array reach a limit (currently 1022) the array is sorted and written to disk. I use an array with offset instead of ranges to support compression (an offset could appear multiple times in the array). I consider this patch quite ready. To optimize REF+ I mark a range as allocated and use this range to get new ones (avoiding writing refcount to disk). When a flush is requested or in some situations (like snapshot) this cache is disabled and flushed (written as REF-). I do not consider this patch ready, it works and pass all io-tests but for instance I would avoid allocating new clusters for refcount during preallocation. End speed up is quite visible allocating clusters (more then 20%). Frediano Ziglio (2): qcow2: optimize refminus updates qcow2: ref+ optimization block/qcow2-refcount.c | 270 +--- block/qcow2.c |2 + block/qcow2.h | 16 +++ 3 files changed, 275 insertions(+), 13 deletions(-)
[Qemu-devel] [PATCH][RFC][1/2] qcow2: optimize refminus updates
Cache refcount decrement in an array to trade-off between leaks and speed. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-refcount.c | 142 ++-- block/qcow2.c |1 + block/qcow2.h | 14 + 3 files changed, 153 insertions(+), 4 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 9605367..7d59b68 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -40,6 +40,13 @@ int qcow2_refcount_init(BlockDriverState *bs) BDRVQcowState *s = bs-opaque; int ret, refcount_table_size2, i; +s-refm_cache_index = 0; +s-refm_cache_len = 1024; +s-refm_cache = g_malloc(s-refm_cache_len * sizeof(uint64)); +if (!s-refm_cache) { +goto fail; +} + refcount_table_size2 = s-refcount_table_size * sizeof(uint64_t); s-refcount_table = g_malloc(refcount_table_size2); if (s-refcount_table_size 0) { @@ -53,12 +60,14 @@ int qcow2_refcount_init(BlockDriverState *bs) } return 0; fail: +g_free(s-refm_cache); return -ENOMEM; } void qcow2_refcount_close(BlockDriverState *bs) { BDRVQcowState *s = bs-opaque; +g_free(s-refm_cache); g_free(s-refcount_table); } @@ -634,13 +643,21 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) void qcow2_free_clusters(BlockDriverState *bs, int64_t offset, int64_t size) { +BDRVQcowState *s = bs-opaque; int ret; +int64_t start, last; BLKDBG_EVENT(bs-file, BLKDBG_CLUSTER_FREE); -ret = update_refcount(bs, offset, size, -1); -if (ret 0) { -fprintf(stderr, qcow2_free_clusters failed: %s\n, strerror(-ret)); -/* TODO Remember the clusters to free them later and avoid leaking */ +start = offset ~(s-cluster_size - 1); +last = (offset + size - 1) ~(s-cluster_size - 1); +for (; start = last; start += s-cluster_size) { +ret = qcow2_refm_add(bs, start); +if (ret 0) { +fprintf(stderr, qcow2_free_clusters failed: %s\n, strerror(-ret)); +/* TODO Remember the clusters to free them later + * and avoid leaking */ +break; +} } } @@ -1165,3 +1182,120 @@ fail: return ret; } +int qcow2_refm_add_any(BlockDriverState *bs, int64_t 
offset) +{ +BDRVQcowState *s = bs-opaque; + +offset = ~QCOW_OFLAG_COPIED; +if (s-refm_cache_index + 2 s-refm_cache_len) { +int ret = qcow2_refm_flush(bs); +if (ret 0) { +return ret; +} +} + +if ((offset QCOW_OFLAG_COMPRESSED)) { +int nb_csectors = ((offset s-csize_shift) s-csize_mask) + 1; +int64_t last; + +offset = (offset s-cluster_offset_mask) ~511; +last = offset + nb_csectors * 512 - 1; +if (!in_same_refcount_block(s, offset, last)) { +s-refm_cache[s-refm_cache_index++] = last; +} +} +s-refm_cache[s-refm_cache_index++] = offset; +return 0; +} + +static int uint64_cmp(const void *a, const void *b) +{ +#define A (*((const uint64_t *)a)) +#define B (*((const uint64_t *)b)) +if (A == B) { +return 0; +} +return A B ? 1 : -1; +#undef A +#undef B +} + +int qcow2_refm_flush(BlockDriverState *bs) +{ +BDRVQcowState *s = bs-opaque; +uint16_t *refcount_block = NULL; +int64_t old_table_index = -1; +int ret, i, saved_index = 0; +int len = s-refm_cache_index; + +/* sort cache */ +qsort(s-refm_cache, len, sizeof(uint64_t), uint64_cmp); + +/* save */ +for (i = 0; i len; ++i) { +uint64_t cluster_offset = s-refm_cache[i]; +int block_index, refcount; +int64_t cluster_index = cluster_offset s-cluster_bits; +int64_t table_index = +cluster_index (s-cluster_bits - REFCOUNT_SHIFT); + +/* Load the refcount block and allocate it if needed */ +if (table_index != old_table_index) { +if (refcount_block) { +ret = qcow2_cache_put(bs, s-refcount_block_cache, +(void **) refcount_block); +if (ret 0) { +goto fail; +} +saved_index = i; +refcount_block = NULL; +} + +ret = alloc_refcount_block(bs, cluster_index, refcount_block); +if (ret 0) { +goto fail; +} +} +old_table_index = table_index; + +qcow2_cache_entry_mark_dirty(s-refcount_block_cache, refcount_block); + +/* we can update the count and save it */ +block_index = cluster_index +((1 (s-cluster_bits - REFCOUNT_SHIFT)) - 1); + +refcount = be16_to_cpu(refcount_block[block_index]); +refcount--; +if (refcount 0) { +ret = -EINVAL; +goto fail; +} 
+if (refcount == 0 cluster_index s-free_cluster_index) { +s
[Qemu-devel] [PATCH][RFC][2/2] qcow2: ref+ optimization
preallocate multiple refcount increment in order to collapse allocation writes. This cause leaks in case of Qemu crash but no corruptions. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-refcount.c | 128 --- block/qcow2.c |1 + block/qcow2.h |2 + 3 files changed, 122 insertions(+), 9 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 7d59b68..3792cda 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -30,6 +30,7 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, int addend); +static void qcow2_refp_enable(BlockDriverState *bs); /*/ @@ -117,6 +118,12 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index) ((1 (s-cluster_bits - REFCOUNT_SHIFT)) - 1); refcount = be16_to_cpu(refcount_block[block_index]); +/* ignore preallocation */ +if (cluster_index = s-refp_prealloc_begin + cluster_index s-refp_prealloc_end) { +--refcount; +} + ret = qcow2_cache_put(bs, s-refcount_block_cache, (void**) refcount_block); if (ret 0) { @@ -207,6 +214,10 @@ static int alloc_refcount_block(BlockDriverState *bs, * refcount block into the cache */ +uint64_t old_free_cluster_index = s-free_cluster_index; +qcow2_refp_flush(bs); +s-free_cluster_index = old_free_cluster_index; + *refcount_block = NULL; /* We write to the refcount table, so we might depend on L2 tables */ @@ -215,6 +226,7 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Allocate the refcount block itself and mark it as used */ int64_t new_block = alloc_clusters_noref(bs, s-cluster_size); if (new_block 0) { +qcow2_refp_enable(bs); return new_block; } @@ -279,6 +291,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } s-refcount_table[refcount_table_index] = new_block; +qcow2_refp_enable(bs); return 0; } @@ -400,10 +413,11 @@ static int alloc_refcount_block(BlockDriverState *bs, s-refcount_table_offset = 
table_offset; /* Free old table. Remember, we must not change free_cluster_index */ -uint64_t old_free_cluster_index = s-free_cluster_index; +old_free_cluster_index = s-free_cluster_index; qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t)); s-free_cluster_index = old_free_cluster_index; +qcow2_refp_enable(bs); ret = load_refcount_block(bs, new_block, (void**) refcount_block); if (ret 0) { return ret; @@ -417,6 +431,7 @@ fail_block: if (*refcount_block != NULL) { qcow2_cache_put(bs, s-refcount_block_cache, (void**) refcount_block); } +qcow2_refp_enable(bs); return ret; } @@ -529,9 +544,23 @@ static int update_cluster_refcount(BlockDriverState *bs, BDRVQcowState *s = bs-opaque; int ret; -ret = update_refcount(bs, cluster_index s-cluster_bits, 1, addend); -if (ret 0) { -return ret; +/* handle preallocation */ +if (cluster_index = s-refp_prealloc_begin + cluster_index s-refp_prealloc_end) { + +/* free previous (should never happen) */ +int64_t index = s-refp_prealloc_begin; +for (; index cluster_index; ++index) { +qcow2_refm_add(bs, index s-cluster_bits); +} +addend--; +s-refp_prealloc_begin = cluster_index + 1; +} +if (addend) { +ret = update_refcount(bs, cluster_index s-cluster_bits, 1, addend); +if (ret 0) { +return ret; +} } bdrv_flush(bs-file); @@ -572,20 +601,94 @@ retry: return (s-free_cluster_index - nb_clusters) s-cluster_bits; } +static void qcow2_refp_enable(BlockDriverState *bs) +{ +BDRVQcowState *s = bs-opaque; + +if (s-refp_prealloc_end 0) { +/* enable again ? 
*/ +if (++s-refp_prealloc_end == 0) { +s-refp_prealloc_end = +s-refp_prealloc_begin; +} +} +} + +int qcow2_refp_flush(BlockDriverState *bs) +{ +BDRVQcowState *s = bs-opaque; +int64_t index, end = s-refp_prealloc_end; + +if (end 0) { +s-refp_prealloc_end = end - 1; +return 0; +} + +index = s-refp_prealloc_begin; +/* this disable next allocations */ +s-refp_prealloc_end = -1; +for (; index end; ++index) { +qcow2_refm_add(bs, index s-cluster_bits); +} +qcow2_refm_flush(bs); +return 0; +} + int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) { -int64_t offset; -int ret; +BDRVQcowState *s = bs-opaque; +int64_t offset, cluster_index; +int ret, nb_clusters
Re: [Qemu-devel] [PATCH] qcow2: fix range check
2011/9/12 Kevin Wolf kw...@redhat.com: Am 10.09.2011 10:23, schrieb Frediano Ziglio: QCowL2Meta::offset is not cluster aligned but only sector aligned however nb_clusters count cluster from cluster start. This fix range check. Note that old code have no corruption issues related to this check cause it only cause intersection to occur when shouldn't. Are you sure? See below. (I think it doesn't corrupt the image, but for a different reason) Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-cluster.c | 14 +++--- 1 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 428b5ad..2f76311 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -776,17 +776,17 @@ again: */ QLIST_FOREACH(old_alloc, s-cluster_allocs, next_in_flight) { - uint64_t end_offset = offset + nb_clusters * s-cluster_size; - uint64_t old_offset = old_alloc-offset; - uint64_t old_end_offset = old_alloc-offset + - old_alloc-nb_clusters * s-cluster_size; + uint64_t start = offset s-cluster_bits; + uint64_t end = start + nb_clusters; + uint64_t old_start = old_alloc-offset s-cluster_bits; + uint64_t old_end = old_start + old_alloc-nb_clusters; - if (end_offset old_offset || offset old_end_offset) { + if (end old_start || start old_end) { /* No intersection */ Consider request A from 0x0 + 0x1000 bytes and request B from 0x2000 + 0x1000 bytes. Both touch the same cluster and therefore should be serialised, but 0x2000 0x1000, so we decided here that there is no intersection and we don't have to care. Note that this doesn't corrupt the image, qcow2 can handle parallel requests allocating the same cluster. In qcow2_alloc_cluster_link_l2() we get an additional COW operation, so performance will be hurt, but correctness is maintained. I tested this adding some printf and also with strace and I can confirm that current code serialize allocation. 
Using ranges A (0-0x1000) and B (0x2000-0x3000) and assuming 0x1 (64k) as cluster size you get A: offset 0 nb_clusters 1 B: offset 0x2000 nb_clusters 1 So without the patch you get two ranges A: 0-0x1 B: 0x2000-0x12000 which intersects. } else { - if (offset old_offset) { + if (start old_start) { /* Stop at the start of a running allocation */ - nb_clusters = (old_offset - offset) s-cluster_bits; + nb_clusters = old_start - start; } else { nb_clusters = 0; } Anyway, the patch looks good. Applied to the block branch. Kevin Oh... I realize that ranges are [start, end) (end not inclusive) so intersection test should be if (end = old_start || start = old_end) { intead of if (end old_start || start old_end) { However I don't understand why I got some small speed penalty with this change (only done some small tests). Frediano
[Qemu-devel] qcow2: snapshot and resize possible?
Looking at block TODOs I saw qcow2 resize with snapshot. However I would ask whether this is technically possible with the current format. The reason is that snapshots have no size (only l1_size; QCowHeader has a size field), however I think that the size is part of the machine state and it is not possible to compute the exact size from l1_size. Should this resize be posted to the qcow3 update? Frediano
Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
2011/9/13 Kevin Wolf kw...@redhat.com: Am 13.09.2011 09:53, schrieb Frediano Ziglio: These patches try to trade-off between leaks and speed for clusters refcounts. Refcount increments (REF+ or refp) are handled in a different way from decrements (REF- or refm). The reason it that posting or not flushing a REF- cause just a leak while posting a REF+ cause a corruption. To optimize REF- I just used an array to store offsets then when a flush is requested or array reach a limit (currently 1022) the array is sorted and written to disk. I use an array with offset instead of ranges to support compression (an offset could appear multiple times in the array). I consider this patch quite ready. Ok, first of all let's clarify what this optimises. I don't think it changes anything at all for the writeback cache modes, because these already do most operations in memory only. So this must be about optimising some operations with cache=writethrough. REF- isn't about normal cluster allocation, it is about COW with internal snapshots or bdrv_discard. Do you have benchmarks for any of them? I strongly disagree with your approach for REF-. We already have a cache, and introducing a second one sounds like a bad idea. I think we could get a very similar effect if we introduced a qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as dirty, but at the same time tells the cache that even in write-through mode it can still treat this block as write-back. This should require much less code changes. Yes, mainly optimize for writethrough. I did not test with writeback but should improve even this (I think here you have some flush to keep consistency). I'll try to write a qcow2_cache_entry_mark_dirty_wb patch and test it. But let's measure the effects first, I suspect that for cluster allocation it doesn't help much because every REF- comes with a REF+. 
That's 50% of effort if REF- clusters are far from REF+ :) To optimize REF+ I mark a range as allocated and use this range to get new ones (avoiding writing refcount to disk). When a flush is requested or in some situations (like snapshot) this cache is disabled and flushed (written as REF-). I do not consider this patch ready, it works and pass all io-tests but for instance I would avoid allocating new clusters for refcount during preallocation. The only question here is if improving cache=writethrough cluster allocation performance is worth the additional complexity in the already complex refcounting code. I didn't see this optimization as a second level cache, but yes, for REF- is a second cache. The alternative that was discussed before is the dirty bit approach that is used in QED and would allow us to use writeback for all refcount blocks, regardless of REF- or REF+. It would be an easier approach requiring less code changes, but it comes with the cost of requiring an fsck after a qemu crash. I was thinking about changing the header magic first time we change refcount in order to mark image as dirty so newer Qemu recognize the flag while former one does not recognize image. Obviously reverting magic on image close. End speed up is quite visible allocating clusters (more then 20%). What benchmark do you use for testing this? Kevin Currently I'm using bonnie++ but I noted similar improves with iozone. The test script format an image then launch a Linux machine which run a script and save result to a file. The test image is seems by this virtual machine as a separate disk. The file on hist reside in a separate LV. I got quite consistent results (of course not working on the machine while testing, is not actually dedicated to this job). Actually I'm running the test (added a test working in a snapshot image). Frediano
Re: [Qemu-devel] [PATCH][RFC][0/2] REF+/REF- optimization
2011/9/13 Kevin Wolf kw...@redhat.com: Am 13.09.2011 09:53, schrieb Frediano Ziglio: These patches try to trade-off between leaks and speed for clusters refcounts. Refcount increments (REF+ or refp) are handled in a different way from decrements (REF- or refm). The reason it that posting or not flushing a REF- cause just a leak while posting a REF+ cause a corruption. To optimize REF- I just used an array to store offsets then when a flush is requested or array reach a limit (currently 1022) the array is sorted and written to disk. I use an array with offset instead of ranges to support compression (an offset could appear multiple times in the array). I consider this patch quite ready. Ok, first of all let's clarify what this optimises. I don't think it changes anything at all for the writeback cache modes, because these already do most operations in memory only. So this must be about optimising some operations with cache=writethrough. REF- isn't about normal cluster allocation, it is about COW with internal snapshots or bdrv_discard. Do you have benchmarks for any of them? I strongly disagree with your approach for REF-. We already have a cache, and introducing a second one sounds like a bad idea. I think we could get a very similar effect if we introduced a qcow2_cache_entry_mark_dirty_wb() that marks a given refcount block as dirty, but at the same time tells the cache that even in write-through mode it can still treat this block as write-back. This should require much less code changes. But let's measure the effects first, I suspect that for cluster allocation it doesn't help much because every REF- comes with a REF+. To optimize REF+ I mark a range as allocated and use this range to get new ones (avoiding writing refcount to disk). When a flush is requested or in some situations (like snapshot) this cache is disabled and flushed (written as REF-). 
I do not consider this patch ready, it works and pass all io-tests but for instance I would avoid allocating new clusters for refcount during preallocation. The only question here is if improving cache=writethrough cluster allocation performance is worth the additional complexity in the already complex refcounting code. The alternative that was discussed before is the dirty bit approach that is used in QED and would allow us to use writeback for all refcount blocks, regardless of REF- or REF+. It would be an easier approach requiring less code changes, but it comes with the cost of requiring an fsck after a qemu crash. End speed up is quite visible allocating clusters (more then 20%). What benchmark do you use for testing this? Kevin Here you are some results (kb/s) with patch (ref-, ref+), qcow2s is qcow2 with a snapshot run rawqcow2 qcow2s 1 22748 4878 4792 2 29557 15839 23144 without run rawqcow2 qcow2s 1 21979 4308 1021 2 26249 13776 24182 Frediano
[Qemu-devel] [PATCH] qcow2: fix range check
QCowL2Meta::offset is not cluster aligned but only sector aligned however nb_clusters count cluster from cluster start. This fix range check. Note that old code have no corruption issues related to this check cause it only cause intersection to occur when shouldn't. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-cluster.c | 14 +++--- 1 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 428b5ad..2f76311 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -776,17 +776,17 @@ again: */ QLIST_FOREACH(old_alloc, s-cluster_allocs, next_in_flight) { -uint64_t end_offset = offset + nb_clusters * s-cluster_size; -uint64_t old_offset = old_alloc-offset; -uint64_t old_end_offset = old_alloc-offset + -old_alloc-nb_clusters * s-cluster_size; +uint64_t start = offset s-cluster_bits; +uint64_t end = start + nb_clusters; +uint64_t old_start = old_alloc-offset s-cluster_bits; +uint64_t old_end = old_start + old_alloc-nb_clusters; -if (end_offset old_offset || offset old_end_offset) { +if (end old_start || start old_end) { /* No intersection */ } else { -if (offset old_offset) { +if (start old_start) { /* Stop at the start of a running allocation */ -nb_clusters = (old_offset - offset) s-cluster_bits; +nb_clusters = old_start - start; } else { nb_clusters = 0; } -- 1.7.1
[Qemu-devel] [PATCH] qcow2: align cluster_data to block to improve performance using O_DIRECT
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 14 +++--- 1 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 8aed310..510ff68 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -237,7 +237,7 @@ static int qcow2_open(BlockDriverState *bs, int flags) s-cluster_cache = g_malloc(s-cluster_size); /* one more sector for decompressed data alignment */ -s-cluster_data = g_malloc(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size +s-cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size + 512); s-cluster_cache_offset = -1; @@ -296,7 +296,7 @@ static int qcow2_open(BlockDriverState *bs, int flags) qcow2_cache_destroy(bs, s-l2_table_cache); } g_free(s-cluster_cache); -g_free(s-cluster_data); +qemu_vfree(s-cluster_data); return ret; } @@ -456,7 +456,7 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, */ if (!cluster_data) { cluster_data = -g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); +qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } assert(cur_nr_sectors = @@ -496,7 +496,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); -g_free(cluster_data); +qemu_vfree(cluster_data); return ret; } @@ -566,7 +566,7 @@ static int qcow2_co_writev(BlockDriverState *bs, if (s-crypt_method) { if (!cluster_data) { -cluster_data = g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * +cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } @@ -611,7 +611,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); -g_free(cluster_data); +qemu_vfree(cluster_data); return ret; } @@ -628,7 +628,7 @@ static void qcow2_close(BlockDriverState *bs) qcow2_cache_destroy(bs, s-refcount_block_cache); g_free(s-cluster_cache); -g_free(s-cluster_data); +qemu_vfree(s-cluster_data); qcow2_refcount_close(bs); } -- 1.7.1
[Qemu-devel] [PATCH] qcow2: initialize metadata before inserting in cluster_allocs
QCow2Meta structure was inserted into list before many fields are initialized. Currently is not a problem cause all occur in a lock but if qcow2_alloc_clusters would in a future unlock this lock some issues could arise. Initializing fields before inserting fix the problem. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-cluster.c | 10 +- 1 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 113db8b..428b5ad 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -806,6 +806,11 @@ again: abort(); } +/* save info needed for meta data update */ +m-offset = offset; +m-n_start = n_start; +m-nb_clusters = nb_clusters; + QLIST_INSERT_HEAD(s-cluster_allocs, m, next_in_flight); /* allocate a new cluster */ @@ -816,11 +821,6 @@ again: goto fail; } -/* save info needed for meta data update */ -m-offset = offset; -m-n_start = n_start; -m-nb_clusters = nb_clusters; - out: ret = qcow2_cache_put(bs, s-l2_table_cache, (void**) l2_table); if (ret 0) { -- 1.7.1
[Qemu-devel] [PATCH] qcow2: removed unused depends_on field
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-cluster.c |3 +-- block/qcow2.h |1 - 2 files changed, 1 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index e06be64..113db8b 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -694,7 +694,7 @@ err: * If the offset is not found, allocate a new cluster. * * If the cluster was already allocated, m-nb_clusters is set to 0, - * m-depends_on is set to NULL and the other fields in m are meaningless. + * other fields in m are meaningless. * * If the cluster is newly allocated, m-nb_clusters is set to the number of * contiguous clusters that have been allocated. In this case, the other @@ -736,7 +736,6 @@ again: cluster_offset = ~QCOW_OFLAG_COPIED; m-nb_clusters = 0; -m-depends_on = NULL; goto out; } diff --git a/block/qcow2.h b/block/qcow2.h index c8ca3bc..531af39 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -148,7 +148,6 @@ typedef struct QCowL2Meta int n_start; int nb_available; int nb_clusters; -struct QCowL2Meta *depends_on; CoQueue dependent_requests; QLIST_ENTRY(QCowL2Meta) next_in_flight; -- 1.7.1
[Qemu-devel] Suspicious code in qcow2.
Actually it does not cause problems but this code order seems a bit wrong to me (block/qcow2-cluster.c) QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight); /* allocate a new cluster */ cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size); if (cluster_offset < 0) { ret = cluster_offset; goto fail; } /* save info needed for meta data update */ m->offset = offset; m->n_start = n_start; m->nb_clusters = nb_clusters; The current metadata (m) gets inserted in the cluster allocation list with nb_clusters set to 0. Loops over cluster_allocs ignore this metadata (they wait for this allocation or just skip it depending on dirty data in the offset field). Currently all of this occurs under a CoMutex so it does not cause problems, but in case qcow2_alloc_clusters unlocks the mutex it could happen that two overlapping updates are inserted into cluster_allocs. Perhaps a better order would be /* save info needed for meta data update */ m->offset = offset; m->n_start = n_start; m->nb_clusters = nb_clusters; QLIST_INSERT_HEAD(&s->cluster_allocs, m, next_in_flight); /* allocate a new cluster */ cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size); if (cluster_offset < 0) { ret = cluster_offset; goto fail; } (tested successfully with iotests suite) Frediano
[Qemu-devel] [PATCH] rename qemu_malloc and related to glib names for coherence
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- trace-events | 10 +- vl.c |6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/trace-events b/trace-events index dc300a2..37da2e0 100644 --- a/trace-events +++ b/trace-events @@ -14,7 +14,7 @@ # # [disable] name(type1 arg1[, type2 arg2] ...) format-string # -# Example: qemu_malloc(size_t size) size %zu +# Example: g_malloc(size_t size) size %zu # # The disable keyword will build without the trace event. # In case of 'simple' trace backend, it will allow the trace event to be @@ -28,10 +28,10 @@ # # The format-string should be a sprintf()-compatible format string. -# qemu-malloc.c -disable qemu_malloc(size_t size, void *ptr) size %zu ptr %p -disable qemu_realloc(void *ptr, size_t size, void *newptr) ptr %p size %zu newptr %p -disable qemu_free(void *ptr) ptr %p +# vl.c +disable g_malloc(size_t size, void *ptr) size %zu ptr %p +disable g_realloc(void *ptr, size_t size, void *newptr) ptr %p size %zu newptr %p +disable g_free(void *ptr) ptr %p # osdep.c disable qemu_memalign(size_t alignment, size_t size, void *ptr) alignment %zu size %zu ptr %p diff --git a/vl.c b/vl.c index 9cd67a3..f71221b 100644 --- a/vl.c +++ b/vl.c @@ -2088,20 +2088,20 @@ static const QEMUOption *lookup_opt(int argc, char **argv, static gpointer malloc_and_trace(gsize n_bytes) { void *ptr = malloc(n_bytes); -trace_qemu_malloc(n_bytes, ptr); +trace_g_malloc(n_bytes, ptr); return ptr; } static gpointer realloc_and_trace(gpointer mem, gsize n_bytes) { void *ptr = realloc(mem, n_bytes); -trace_qemu_realloc(mem, n_bytes, ptr); +trace_g_realloc(mem, n_bytes, ptr); return ptr; } static void free_and_trace(gpointer mem) { -trace_qemu_free(mem); +trace_g_free(mem); free(mem); } -- 1.7.1
[Qemu-devel] [PATCH] linux aio: some comments
Add some notes about Linux AIO explaining why we don't use AIO in some situations. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/raw-posix.c |4 linux-aio.c |1 + 2 files changed, 5 insertions(+), 0 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index c5c9944..bcf50b2 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -236,6 +236,10 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } #ifdef CONFIG_LINUX_AIO +/* + * Currently Linux do AIO only for files opened with O_DIRECT + * specified so check NOCACHE flag too + */ if ((bdrv_flags (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { diff --git a/linux-aio.c b/linux-aio.c index 5fd3932..5265a02 100644 --- a/linux-aio.c +++ b/linux-aio.c @@ -181,6 +181,7 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, case QEMU_AIO_READ: io_prep_preadv(iocbs, fd, qiov-iov, qiov-niov, offset); break; +/* Currently Linux kernel does not support other operations */ default: fprintf(stderr, %s: invalid AIO request type 0x%x.\n, __func__, type); -- 1.7.1
[Qemu-devel] [PATCH] disasm: update comment
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- disas.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/disas.c b/disas.c index 1334b8e..611b30b 100644 --- a/disas.c +++ b/disas.c @@ -137,7 +137,7 @@ print_insn_thumb1(bfd_vma pc, disassemble_info *info) /* Disassemble this for me please... (debugging). 'flags' has the following values: -i386 - nonzero means 16 bit code +i386 - 1 means 16 bit code, 2 means 64 bit code arm - nonzero means thumb code ppc - nonzero means little endian other targets - unused -- 1.7.1
[Qemu-devel] [PATCH] qcow2: use always stderr for debugging
let all DEBUG_ALLOC2 printf goes to stderr Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-cluster.c |2 +- block/qcow2-refcount.c |4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 9269dda..9034840 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -53,7 +53,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size, bool exact_size) } #ifdef DEBUG_ALLOC2 -printf(grow l1_table from %d to %d\n, s-l1_size, new_l1_size); +fprintf(stderr, grow l1_table from %d to %d\n, s-l1_size, new_l1_size); #endif new_l1_size2 = sizeof(uint64_t) * new_l1_size; diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 2a915be..fbf28da 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -422,7 +422,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int ret; #ifdef DEBUG_ALLOC2 -printf(update_refcount: offset=% PRId64 size=% PRId64 addend=%d\n, +fprintf(stderr, update_refcount: offset=% PRId64 size=% PRId64 addend=%d\n, offset, length, addend); #endif if (length 0) { @@ -556,7 +556,7 @@ retry: } } #ifdef DEBUG_ALLOC2 -printf(alloc_clusters: size=% PRId64 - % PRId64 \n, +fprintf(stderr, alloc_clusters: size=% PRId64 - % PRId64 \n, size, (s-free_cluster_index - nb_clusters) s-cluster_bits); #endif -- 1.7.1
[Qemu-devel] [PATCH] qcow2: remove unused qcow2_create_refcount_update function
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-refcount.c | 18 -- block/qcow2.h |2 -- 2 files changed, 0 insertions(+), 20 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 2a915be..62946bb 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -680,24 +680,6 @@ void qcow2_free_any_clusters(BlockDriverState *bs, -void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset, -int64_t size) -{ -int refcount; -int64_t start, last, cluster_offset; -uint16_t *p; - -start = offset ~(s-cluster_size - 1); -last = (offset + size - 1) ~(s-cluster_size - 1); -for(cluster_offset = start; cluster_offset = last; -cluster_offset += s-cluster_size) { -p = s-refcount_block[cluster_offset s-cluster_bits]; -refcount = be16_to_cpu(*p); -refcount++; -*p = cpu_to_be16(refcount); -} -} - /* update the refcounts of snapshots and the copied flag */ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend) diff --git a/block/qcow2.h b/block/qcow2.h index de23abe..c8ca3bc 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -189,8 +189,6 @@ void qcow2_free_clusters(BlockDriverState *bs, void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t cluster_offset, int nb_clusters); -void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset, -int64_t size); int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend); -- 1.7.1
Re: [Qemu-devel] [PATCH 2/2] disasm: update comment
2011/8/25 Peter Maydell peter.mayd...@linaro.org: On 25 August 2011 09:44, Stefan Hajnoczi stefa...@linux.vnet.ibm.com wrote: /* Disassemble this for me please... (debugging). 'flags' has the following values: - i386 - nonzero means 16 bit code + i386 - 1 means 16 bit code, 2 means 64 bit code ...presumably 0 means 32 bit code, by elimination ? -- PMM Currently you can pass even 10 :) Yes, 0 (default) means 32 bit. Frediano
[Qemu-devel] [PATCH v3 00/15] qcow/qcow2 cleanups
These patches mostly cleanup some AIO code using coroutines. Mostly they use stack instead of allocated AIO structure. Feel free to collapse it too short. Frediano Ziglio (15): qcow: allocate QCowAIOCB structure using stack qcow: QCowAIOCB field cleanup qcow: move some blocks of code to avoid useless variable initialization qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev qcow: remove old #undefined code qcow2: removed unused fields qcow2: removed cur_nr_sectors field in QCowAIOCB qcow2: remove l2meta from QCowAIOCB qcow2: remove cluster_offset from QCowAIOCB qcow2: remove common from QCowAIOCB qcow2: reindent and use while before the big jump qcow2: removed QCowAIOCB entirely qcow2: remove memory leak qcow2: small math optimization qcow2: small optimization block/qcow.c | 378 ++-- block/qcow2-refcount.c | 16 +-- block/qcow2.c | 412 +++ 3 files changed, 294 insertions(+), 512 deletions(-)
[Qemu-devel] [PATCH v3 01/15] qcow: allocate QCowAIOCB structure using stack
instead of calling qemi_aio_get use stack Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 52 block/qcow2.c | 38 +++--- 2 files changed, 27 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index e155d3c..b4506b4 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -503,28 +503,12 @@ typedef struct QCowAIOCB { BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; -static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -if (acb-hd_aiocb) -bdrv_aio_cancel(acb-hd_aiocb); -qemu_aio_release(acb); -} - -static AIOPool qcow_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow_aio_cancel, -}; - static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write) +int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-hd_aiocb = NULL; acb-sector_num = sector_num; acb-qiov = qiov; @@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return acb; } -static int qcow_aio_read_cb(void *opaque) +static int qcow_aio_read_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_read_cb(acb); +ret = qcow_aio_read_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_iovec_from_buffer(acb.qiov, 
acb.orig_buf, acb.qiov-size); +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } -static int qcow_aio_write_cb(void *opaque) +static int qcow_aio_write_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; s-cluster_cache_offset = -1; /* disable compressed cache */ -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_write_cb(acb); +ret = qcow_aio_write_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index bfff6cd..bb6c75e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -388,17 +388,6 @@ typedef struct QCowAIOCB { QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; -static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -qemu_aio_release(acb); -} - -static AIOPool qcow2_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow2_aio_cancel, -}; - /* * Returns 0 when the request is completed successfully, 1 when there is still * a part left to do and -errno in error cases. 
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write) + void *opaque, int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; acb-is_write = is_write; @@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int
[Qemu-devel] [PATCH v3 02/15] qcow: QCowAIOCB field cleanup
remove unused field from this structure and put some of them in qcow_aio_read_cb and qcow_aio_write_cb Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 137 +++-- 1 files changed, 65 insertions(+), 72 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index b4506b4..9754ca9 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -487,72 +487,61 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, #endif typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; uint8_t *buf; void *orig_buf; int nb_sectors; -int n; -uint64_t cluster_offset; -uint8_t *cluster_data; -struct iovec hd_iov; -bool is_write; -QEMUBH *bh; -QEMUIOVector hd_qiov; -BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int is_write, QCowAIOCB *acb) { -memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; -acb-hd_aiocb = NULL; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; if (qiov-niov 1) { acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); if (is_write) qemu_iovec_to_buffer(qiov, acb-buf); } else { +acb-orig_buf = NULL; acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; -acb-n = 0; -acb-cluster_offset = 0; return acb; } static int qcow_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret; - -acb-hd_aiocb = NULL; +int ret, n = 0; +uint64_t cluster_offset = 0; +struct iovec hd_iov; +QEMUIOVector hd_qiov; redo: /* post process the read buffer */ -if (!acb-cluster_offset) { +if (!cluster_offset) { /* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* nothing to do */ } else { if (s-crypt_method) { encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -acb-n, 0, +n, 0, 
s-aes_decrypt_key); } } -acb-nb_sectors -= acb-n; -acb-sector_num += acb-n; -acb-buf += acb-n * 512; +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; if (acb-nb_sectors == 0) { /* request completed */ @@ -560,57 +549,58 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } /* prepare next AIO request */ -acb-cluster_offset = get_cluster_offset(bs, acb-sector_num 9, +cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -acb-n = s-cluster_sectors - index_in_cluster; -if (acb-n acb-nb_sectors) -acb-n = acb-nb_sectors; +n = s-cluster_sectors - index_in_cluster; +if (n acb-nb_sectors) { +n = acb-nb_sectors; +} -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ -acb-hd_iov.iov_base = (void *)acb-buf; -acb-hd_iov.iov_len = acb-n * 512; -qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1); +hd_iov.iov_base = (void *)acb-buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -acb-n, acb-hd_qiov); +n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return -EIO; } } else { /* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * acb-n); +memset(acb-buf, 0, 512 * n); goto redo; } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, acb-cluster_offset) 0) { +if (decompress_cluster(bs, cluster_offset) 0) { return -EIO; } memcpy(acb-buf, - s-cluster_cache + index_in_cluster * 512, 512 * acb-n); + s-cluster_cache + index_in_cluster * 512, 512 * n); goto redo; } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } -acb-hd_iov.iov_base = (void *)acb-buf
[Qemu-devel] [PATCH v3 03/15] qcow: move some blocks of code to avoid useless variable initialization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 53 ++--- 1 files changed, 26 insertions(+), 27 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 9754ca9..4ede7f3 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n = 0; -uint64_t cluster_offset = 0; +int ret, n; +uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -/* post process the read buffer */ -if (!cluster_offset) { -/* nothing to do */ -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -n, 0, -s-aes_decrypt_key); -} -} - -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ +/* prepare next request */ cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); @@ -573,7 +556,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } else { /* Note: in this case, no need to wait */ memset(acb-buf, 0, 512 * n); -goto redo; } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ @@ -582,7 +564,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } memcpy(acb-buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -goto redo; } else { if ((cluster_offset 511) != 0) { return -EIO; @@ -600,6 +581,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } } +/* post process the read buffer */ +if (!cluster_offset) { +/* nothing to do */ +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* nothing to do */ +} else { +if (s-crypt_method) { +encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, +n, 0, +s-aes_decrypt_key); +} +} + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } @@ -631,16 +629,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) int index_in_cluster; uint64_t cluster_offset; const uint8_t *src_buf; -int ret, n = 0; +int ret, n; uint8_t *cluster_data = NULL; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; @@ -683,6 +677,11 @@ redo: if (ret 0) { return ret; } + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } -- 1.7.1
[Qemu-devel] [PATCH v3 05/15] qcow: remove old #undefined code
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 63 -- 1 files changed, 0 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index f28c821..8cbabdd 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -190,24 +190,6 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) return -1; if (AES_set_decrypt_key(keybuf, 128, s-aes_decrypt_key) != 0) return -1; -#if 0 -/* test */ -{ -uint8_t in[16]; -uint8_t out[16]; -uint8_t tmp[16]; -for(i=0;i16;i++) -in[i] = i; -AES_encrypt(in, tmp, s-aes_encrypt_key); -AES_decrypt(tmp, out, s-aes_decrypt_key); -for(i = 0; i 16; i++) -printf( %02x, tmp[i]); -printf(\n); -for(i = 0; i 16; i++) -printf( %02x, out[i]); -printf(\n); -} -#endif return 0; } @@ -441,51 +423,6 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) return 0; } -#if 0 - -static int qcow_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ -BDRVQcowState *s = bs-opaque; -int ret, index_in_cluster, n; -uint64_t cluster_offset; - -while (nb_sectors 0) { -cluster_offset = get_cluster_offset(bs, sector_num 9, 0, 0, 0, 0); -index_in_cluster = sector_num (s-cluster_sectors - 1); -n = s-cluster_sectors - index_in_cluster; -if (n nb_sectors) -n = nb_sectors; -if (!cluster_offset) { -if (bs-backing_hd) { -/* read from the base image */ -ret = bdrv_read(bs-backing_hd, sector_num, buf, n); -if (ret 0) -return -1; -} else { -memset(buf, 0, 512 * n); -} -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -if (decompress_cluster(bs, cluster_offset) 0) -return -1; -memcpy(buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -} else { -ret = bdrv_pread(bs-file, cluster_offset + index_in_cluster * 512, buf, n * 512); -if (ret != n * 512) -return -1; -if (s-crypt_method) { -encrypt_sectors(s, sector_num, buf, buf, n, 0, -s-aes_decrypt_key); -} -} -nb_sectors -= n; -sector_num += n; -buf += n * 512; -} -return 0; -} -#endif - static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num, int nb_sectors, QEMUIOVector *qiov) { -- 1.7.1
[Qemu-devel] [PATCH v3 12/15] qcow2: removed QCowAIOCB entirely
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 207 ++--- 1 files changed, 80 insertions(+), 127 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 287dd99..d34bd72 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -372,83 +372,77 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, return n1; } -typedef struct QCowAIOCB { -BlockDriverState *bs; -int64_t sector_num; -QEMUIOVector *qiov; -int remaining_sectors; -uint64_t bytes_done; -uint8_t *cluster_data; -QEMUIOVector hd_qiov; -} QCowAIOCB; - -/* - * Returns 0 when the request is completed successfully, 1 when there is still - * a part left to do and -errno in error cases. - */ -static int qcow2_aio_read_cb(QCowAIOCB *acb) +static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) { -BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset = 0; +uint64_t bytes_done = 0; +QEMUIOVector hd_qiov; +uint8_t *cluster_data = NULL; -while (acb-remaining_sectors != 0) { +qemu_iovec_init(hd_qiov, qiov-niov); + +qemu_co_mutex_lock(s-lock); + +while (remaining_sectors != 0) { /* prepare next request */ -cur_nr_sectors = acb-remaining_sectors; +cur_nr_sectors = remaining_sectors; if (s-crypt_method) { cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } -ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, +ret = qcow2_get_cluster_offset(bs, sector_num 9, cur_nr_sectors, cluster_offset); if (ret 0) { -return ret; +goto fail; } -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); +index_in_cluster = sector_num (s-cluster_sectors - 1); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, +qemu_iovec_reset(hd_qiov); +qemu_iovec_copy(hd_qiov, qiov, bytes_done, cur_nr_sectors * 512); if (!cluster_offset) { if (bs-backing_hd) { /* 
read from the base image */ -n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, cur_nr_sectors); +n1 = qcow2_backing_read1(bs-backing_hd, hd_qiov, +sector_num, cur_nr_sectors); if (n1 0) { BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -n1, acb-hd_qiov); +ret = bdrv_co_readv(bs-backing_hd, sector_num, +n1, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -return ret; +goto fail; } } } else { /* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +qemu_iovec_memset(hd_qiov, 0, 512 * cur_nr_sectors); } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ ret = qcow2_decompress_cluster(bs, cluster_offset); if (ret 0) { -return ret; +goto fail; } -qemu_iovec_from_buffer(acb-hd_qiov, +qemu_iovec_from_buffer(hd_qiov, s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); } else { if ((cluster_offset 511) != 0) { -return -EIO; +ret = -EIO; +goto fail; } if (s-crypt_method) { @@ -456,15 +450,15 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) * For encrypted images, read everything into a temporary * contiguous buffer on which the AES functions can work. */ -if (!acb-cluster_data) { -acb-cluster_data = +if (!cluster_data) { +cluster_data = g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } assert(cur_nr_sectors = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_add(acb-hd_qiov, acb-cluster_data, +qemu_iovec_reset(hd_qiov
[Qemu-devel] [PATCH v3 13/15] qcow2: remove memory leak
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index d34bd72..07529b1 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -492,6 +492,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); +g_free(cluster_data); return ret; } @@ -604,6 +605,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); +g_free(cluster_data); return ret; } -- 1.7.1
[Qemu-devel] [PATCH v3 14/15] qcow2: small math optimization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-refcount.c |5 + 1 files changed, 1 insertions(+), 4 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 2a915be..0f9a64a 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -140,10 +140,7 @@ static unsigned int next_refcount_table_size(BDRVQcowState *s, static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, uint64_t offset_b) { -uint64_t block_a = offset_a (2 * s-cluster_bits - REFCOUNT_SHIFT); -uint64_t block_b = offset_b (2 * s-cluster_bits - REFCOUNT_SHIFT); - -return (block_a == block_b); +return ((offset_a ^ offset_b) (2 * s-cluster_bits - REFCOUNT_SHIFT)) == 0; } /* -- 1.7.1
[Qemu-devel] [PATCH v3 15/15] qcow2: small optimization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-refcount.c | 11 +-- 1 files changed, 5 insertions(+), 6 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 0f9a64a..243c24b 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -681,14 +681,13 @@ void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset, int64_t size) { int refcount; -int64_t start, last, cluster_offset; +int64_t start, last, cluster; uint16_t *p; -start = offset ~(s-cluster_size - 1); -last = (offset + size - 1) ~(s-cluster_size - 1); -for(cluster_offset = start; cluster_offset = last; -cluster_offset += s-cluster_size) { -p = s-refcount_block[cluster_offset s-cluster_bits]; +start = offset s-cluster_bits; +last = (offset + size - 1) s-cluster_bits; +for(cluster = start; cluster = last; ++cluster) { +p = s-refcount_block[cluster]; refcount = be16_to_cpu(*p); refcount++; *p = cpu_to_be16(refcount); -- 1.7.1
[Qemu-devel] [PATCH v3 04/15] qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 291 - 1 files changed, 123 insertions(+), 168 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 4ede7f3..f28c821 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -486,223 +486,178 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, } #endif -typedef struct QCowAIOCB { -BlockDriverState *bs; -int64_t sector_num; -QEMUIOVector *qiov; -uint8_t *buf; -void *orig_buf; -int nb_sectors; -} QCowAIOCB; - -static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, -int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write, QCowAIOCB *acb) -{ -acb-bs = bs; -acb-sector_num = sector_num; -acb-qiov = qiov; - -if (qiov-niov 1) { -acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); -if (is_write) -qemu_iovec_to_buffer(qiov, acb-buf); -} else { -acb-orig_buf = NULL; -acb-buf = (uint8_t *)qiov-iov-iov_base; -} -acb-nb_sectors = nb_sectors; -return acb; -} - -static int qcow_aio_read_cb(QCowAIOCB *acb) +static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { -BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n; +int ret = 0, n; uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; +uint8_t *buf; +void *orig_buf; - redo: -if (acb-nb_sectors == 0) { -/* request completed */ -return 0; +if (qiov-niov 1) { +buf = orig_buf = qemu_blockalign(bs, qiov-size); +} else { +orig_buf = NULL; +buf = (uint8_t *)qiov-iov-iov_base; } -/* prepare next request */ -cluster_offset = get_cluster_offset(bs, acb-sector_num 9, - 0, 0, 0, 0); -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -n = s-cluster_sectors - index_in_cluster; -if (n acb-nb_sectors) { -n = acb-nb_sectors; -} +qemu_co_mutex_lock(s-lock); + +while (nb_sectors != 0) { +/* prepare next request */ +cluster_offset = get_cluster_offset(bs, sector_num 9, + 0, 0, 0, 0); +index_in_cluster = sector_num (s-cluster_sectors - 1); 
+n = s-cluster_sectors - index_in_cluster; +if (n nb_sectors) { +n = nb_sectors; +} -if (!cluster_offset) { -if (bs-backing_hd) { -/* read from the base image */ -hd_iov.iov_base = (void *)acb-buf; +if (!cluster_offset) { +if (bs-backing_hd) { +/* read from the base image */ +hd_iov.iov_base = (void *)buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); +qemu_co_mutex_unlock(s-lock); +ret = bdrv_co_readv(bs-backing_hd, sector_num, +n, hd_qiov); +qemu_co_mutex_lock(s-lock); +if (ret 0) { +goto fail; +} +} else { +/* Note: in this case, no need to wait */ +memset(buf, 0, 512 * n); +} +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* add AIO support for compressed blocks ? */ +if (decompress_cluster(bs, cluster_offset) 0) { +goto fail; +} +memcpy(buf, + s-cluster_cache + index_in_cluster * 512, 512 * n); +} else { +if ((cluster_offset 511) != 0) { +goto fail; +} +hd_iov.iov_base = (void *)buf; hd_iov.iov_len = n * 512; qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, +ret = bdrv_co_readv(bs-file, +(cluster_offset 9) + index_in_cluster, n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -return -EIO; +break; +} +if (s-crypt_method) { +encrypt_sectors(s, sector_num, buf, buf, +n, 0, +s-aes_decrypt_key); } -} else { -/* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * n); -} -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, cluster_offset) 0) { -return -EIO; -} -memcpy(acb-buf, - s
[Qemu-devel] [PATCH v3 09/15] qcow2: remove cluster_offset from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 22 +++--- 1 files changed, 11 insertions(+), 11 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 4b9ec29..7e13283 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -378,7 +378,6 @@ typedef struct QCowAIOCB { QEMUIOVector *qiov; int remaining_sectors; uint64_t bytes_done; -uint64_t cluster_offset; uint8_t *cluster_data; QEMUIOVector hd_qiov; } QCowAIOCB; @@ -394,6 +393,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) int index_in_cluster, n1; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ +uint64_t cluster_offset = 0; if (acb-remaining_sectors == 0) { /* request completed */ @@ -408,7 +408,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) } ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -cur_nr_sectors, acb-cluster_offset); +cur_nr_sectors, cluster_offset); if (ret 0) { return ret; } @@ -419,7 +419,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, cur_nr_sectors * 512); -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ @@ -439,9 +439,9 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) /* Note: in this case, no need to wait */ qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ -ret = qcow2_decompress_cluster(bs, acb-cluster_offset); +ret = qcow2_decompress_cluster(bs, cluster_offset); if (ret 0) { return ret; } @@ -450,7 +450,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } @@ -474,7 +474,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) BLKDBG_EVENT(bs-file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-file, -(acb-cluster_offset 9) + index_in_cluster, +(cluster_offset 9) + index_in_cluster, cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { @@ -512,7 +512,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb-bytes_done = 0; acb-remaining_sectors = nb_sectors; -acb-cluster_offset = 0; return acb; } @@ -564,6 +563,7 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) int ret; int cur_nr_sectors; /* number of sectors in current iteration */ QCowL2Meta l2meta; +uint64_t cluster_offset; l2meta.nb_clusters = 0; qemu_co_queue_init(l2meta.dependent_requests); @@ -585,8 +585,8 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) return ret; } -acb-cluster_offset = l2meta.cluster_offset; -assert((acb-cluster_offset 511) == 0); +cluster_offset = l2meta.cluster_offset; +assert((cluster_offset 511) == 0); qemu_iovec_reset(acb-hd_qiov); qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, @@ -612,7 +612,7 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) BLKDBG_EVENT(bs-file, BLKDBG_WRITE_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_writev(bs-file, - (acb-cluster_offset 9) + index_in_cluster, + (cluster_offset 9) + index_in_cluster, cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -- 1.7.1
[Qemu-devel] [PATCH v3 10/15] qcow2: remove common from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c |8 1 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 7e13283..519fc8b 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -373,7 +373,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, } typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; int remaining_sectors; @@ -388,7 +388,7 @@ typedef struct QCowAIOCB { */ static int qcow2_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; @@ -504,7 +504,7 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, void *opaque, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; @@ -556,7 +556,7 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) */ static int qcow2_aio_write_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; int n_end; -- 1.7.1
Re: [Qemu-devel] [PATCH v2 00/15] qcow/qcow2 cleanups
2011/8/22 Kevin Wolf kw...@redhat.com: Am 09.08.2011 09:46, schrieb Frediano Ziglio: These patches mostly cleanup some AIO code using coroutines. Mostly they use stack instead of allocated AIO structure. Feel free to collapse it too short. Frediano Ziglio (15): qcow: allocate QCowAIOCB structure using stack qcow: QCowAIOCB field cleanup qcow: move some blocks of code to avoid useless variable initialization qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev qcow: remove old #undefined code qcow2: removed unused fields qcow2: removed cur_nr_sectors field in QCowAIOCB qcow2: remove l2meta from QCowAIOCB qcow2: remove cluster_offset from QCowAIOCB qcow2: remove common from QCowAIOCB qcow2: reindent and use while before the big jump qcow2: removed QCowAIOCB entirely qcow2: remove memory leak qcow2: small math optimization qcow2: small optimization block/qcow.c | 378 ++-- block/qcow2-refcount.c | 16 +-- block/qcow2.c | 412 +++ 3 files changed, 294 insertions(+), 512 deletions(-) Can you please rebase this series to current master? I expect that most conflicts are related to the qemu_malloc - g_malloc change, so they should be easy to fix. Done, yes glib rename was the problem. I rebased all patches and tested latest with iotests. One thing I noticed when reading the first two patches is that there is some state in ACBs that previously was shared across multiple requests (e.g. the bounce buffer for encryption). It is no longer shared after you move the ACB to the stack, so each request allocates a new buffer. Have you checked if this makes a noticeable difference in performance for encrypted images? Kevin I didn't test encrypt performance. Looking at master code the only cached state that get reused is cluster_data so only encryption is affected. Frediano
[Qemu-devel] [PATCH v3 06/15] qcow2: removed unused fields
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 10 +++--- 1 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index bb6c75e..4171b47 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -381,11 +381,8 @@ typedef struct QCowAIOCB { uint64_t bytes_done; uint64_t cluster_offset; uint8_t *cluster_data; -bool is_write; QEMUIOVector hd_qiov; -QEMUBH *bh; QCowL2Meta l2meta; -QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; /* @@ -517,13 +514,12 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write, QCowAIOCB *acb) + void *opaque, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; qemu_iovec_init(acb-hd_qiov, qiov-niov); @@ -543,7 +539,7 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, QCowAIOCB acb; int ret; -qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0, acb); +qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, acb); qemu_co_mutex_lock(s-lock); do { @@ -658,7 +654,7 @@ static int qcow2_co_writev(BlockDriverState *bs, QCowAIOCB acb; int ret; -qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1, acb); +qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, acb); s-cluster_cache_offset = -1; /* disable compressed cache */ qemu_co_mutex_lock(s-lock); -- 1.7.1
[Qemu-devel] [PATCH v3 07/15] qcow2: removed cur_nr_sectors field in QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 98 + 1 files changed, 43 insertions(+), 55 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 4171b47..f920dbe 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -377,7 +377,6 @@ typedef struct QCowAIOCB { int64_t sector_num; QEMUIOVector *qiov; int remaining_sectors; -int cur_nr_sectors;/* number of sectors in current iteration */ uint64_t bytes_done; uint64_t cluster_offset; uint8_t *cluster_data; @@ -395,42 +394,22 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; - -/* post process the read buffer */ -if (!acb-cluster_offset) { -/* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -qcow2_encrypt_sectors(s, acb-sector_num, acb-cluster_data, -acb-cluster_data, acb-cur_nr_sectors, 0, s-aes_decrypt_key); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, -acb-cur_nr_sectors * 512); -qemu_iovec_from_buffer(acb-hd_qiov, acb-cluster_data, -512 * acb-cur_nr_sectors); -} -} - -acb-remaining_sectors -= acb-cur_nr_sectors; -acb-sector_num += acb-cur_nr_sectors; -acb-bytes_done += acb-cur_nr_sectors * 512; +int cur_nr_sectors; /* number of sectors in current iteration */ if (acb-remaining_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ -acb-cur_nr_sectors = acb-remaining_sectors; +/* prepare next request */ +cur_nr_sectors = acb-remaining_sectors; if (s-crypt_method) { -acb-cur_nr_sectors = MIN(acb-cur_nr_sectors, +cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -acb-cur_nr_sectors, acb-cluster_offset); +cur_nr_sectors, acb-cluster_offset); if (ret 0) { return ret; } @@ -439,14 +418,14 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_reset(acb-hd_qiov); qemu_iovec_copy(acb-hd_qiov, 
acb-qiov, acb-bytes_done, -acb-cur_nr_sectors * 512); +cur_nr_sectors * 512); if (!acb-cluster_offset) { if (bs-backing_hd) { /* read from the base image */ n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, acb-cur_nr_sectors); +acb-sector_num, cur_nr_sectors); if (n1 0) { BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(s-lock); @@ -457,11 +436,9 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) return ret; } } -return 1; } else { /* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * acb-cur_nr_sectors); -return 1; +qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); } } else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ @@ -472,9 +449,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_from_buffer(acb-hd_qiov, s-cluster_cache + index_in_cluster * 512, -512 * acb-cur_nr_sectors); - -return 1; +512 * cur_nr_sectors); } else { if ((acb-cluster_offset 511) != 0) { return -EIO; @@ -490,24 +465,37 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } -assert(acb-cur_nr_sectors = +assert(cur_nr_sectors = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); qemu_iovec_reset(acb-hd_qiov); qemu_iovec_add(acb-hd_qiov, acb-cluster_data, -512 * acb-cur_nr_sectors); +512 * cur_nr_sectors); } BLKDBG_EVENT(bs-file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-file, (acb-cluster_offset 9) + index_in_cluster, -acb-cur_nr_sectors, acb-hd_qiov); +cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return ret; } +if (s-crypt_method) { +qcow2_encrypt_sectors(s, acb-sector_num, acb-cluster_data, +acb-cluster_data, cur_nr_sectors, 0, s-aes_decrypt_key); +qemu_iovec_reset(acb-hd_qiov); +qemu_iovec_copy(acb-hd_qiov, acb-qiov
[Qemu-devel] [PATCH v3 11/15] qcow2: reindent and use while before the big jump
prepare to remove read/write callbacks Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 272 - 1 files changed, 135 insertions(+), 137 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 519fc8b..287dd99 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -395,107 +395,105 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset = 0; -if (acb-remaining_sectors == 0) { -/* request completed */ -return 0; -} - -/* prepare next request */ -cur_nr_sectors = acb-remaining_sectors; -if (s-crypt_method) { -cur_nr_sectors = MIN(cur_nr_sectors, -QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -} +while (acb-remaining_sectors != 0) { -ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -cur_nr_sectors, cluster_offset); -if (ret 0) { -return ret; -} - -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); - -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, -cur_nr_sectors * 512); - -if (!cluster_offset) { - -if (bs-backing_hd) { -/* read from the base image */ -n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, cur_nr_sectors); -if (n1 0) { -BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); -qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -n1, acb-hd_qiov); -qemu_co_mutex_lock(s-lock); -if (ret 0) { -return ret; -} -} -} else { -/* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +/* prepare next request */ +cur_nr_sectors = acb-remaining_sectors; +if (s-crypt_method) { +cur_nr_sectors = MIN(cur_nr_sectors, +QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* add AIO support for compressed blocks ? 
*/ -ret = qcow2_decompress_cluster(bs, cluster_offset); + +ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, +cur_nr_sectors, cluster_offset); if (ret 0) { return ret; } -qemu_iovec_from_buffer(acb-hd_qiov, -s-cluster_cache + index_in_cluster * 512, -512 * cur_nr_sectors); -} else { -if ((cluster_offset 511) != 0) { -return -EIO; -} +index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -if (s-crypt_method) { -/* - * For encrypted images, read everything into a temporary - * contiguous buffer on which the AES functions can work. - */ -if (!acb-cluster_data) { -acb-cluster_data = -g_malloc0(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); +qemu_iovec_reset(acb-hd_qiov); +qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, +cur_nr_sectors * 512); + +if (!cluster_offset) { + +if (bs-backing_hd) { +/* read from the base image */ +n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, +acb-sector_num, cur_nr_sectors); +if (n1 0) { +BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); +qemu_co_mutex_unlock(s-lock); +ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, +n1, acb-hd_qiov); +qemu_co_mutex_lock(s-lock); +if (ret 0) { +return ret; +} +} +} else { +/* Note: in this case, no need to wait */ +qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +} +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* add AIO support for compressed blocks ? */ +ret = qcow2_decompress_cluster(bs, cluster_offset); +if (ret 0) { +return ret; } -assert(cur_nr_sectors = -QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_add(acb-hd_qiov, acb-cluster_data, +qemu_iovec_from_buffer(acb-hd_qiov, +s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); -} +} else { +if ((cluster_offset 511) != 0) { +return -EIO; +} -BLKDBG_EVENT(bs-file
[Qemu-devel] [PATCH v3 08/15] qcow2: remove l2meta from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 15 --- 1 files changed, 8 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index f920dbe..4b9ec29 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -381,7 +381,6 @@ typedef struct QCowAIOCB { uint64_t cluster_offset; uint8_t *cluster_data; QEMUIOVector hd_qiov; -QCowL2Meta l2meta; } QCowAIOCB; /* @@ -514,8 +513,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb-bytes_done = 0; acb-remaining_sectors = nb_sectors; acb-cluster_offset = 0; -acb-l2meta.nb_clusters = 0; -qemu_co_queue_init(acb-l2meta.dependent_requests); return acb; } @@ -566,6 +563,10 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) int n_end; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ +QCowL2Meta l2meta; + +l2meta.nb_clusters = 0; +qemu_co_queue_init(l2meta.dependent_requests); if (acb-remaining_sectors == 0) { /* request completed */ @@ -579,12 +580,12 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) n_end = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors; ret = qcow2_alloc_cluster_offset(bs, acb-sector_num 9, -index_in_cluster, n_end, cur_nr_sectors, acb-l2meta); +index_in_cluster, n_end, cur_nr_sectors, l2meta); if (ret 0) { return ret; } -acb-cluster_offset = acb-l2meta.cluster_offset; +acb-cluster_offset = l2meta.cluster_offset; assert((acb-cluster_offset 511) == 0); qemu_iovec_reset(acb-hd_qiov); @@ -618,9 +619,9 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) return ret; } -ret = qcow2_alloc_cluster_link_l2(bs, acb-l2meta); +ret = qcow2_alloc_cluster_link_l2(bs, l2meta); -run_dependent_requests(s, acb-l2meta); +run_dependent_requests(s, l2meta); if (ret 0) { return ret; -- 1.7.1
[Qemu-devel] full allocation and file fragmentation.
Looking at your patch for full image preallocation I noted you used bdrv_write to fill space with zeroes. Nothing wrong, but I would add a call to fallocate or posix_fallocate in order to get less fragmentation on the file system. Perhaps there should be a bdrv_preallocate call? Regards Frediano
[Qemu-devel] [PATCH v2 00/15] qcow/qcow2 cleanups
These patches mostly cleanup some AIO code using coroutines. Mostly they use stack instead of allocated AIO structure. Feel free to collapse it too short. Frediano Ziglio (15): qcow: allocate QCowAIOCB structure using stack qcow: QCowAIOCB field cleanup qcow: move some blocks of code to avoid useless variable initialization qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev qcow: remove old #undefined code qcow2: removed unused fields qcow2: removed cur_nr_sectors field in QCowAIOCB qcow2: remove l2meta from QCowAIOCB qcow2: remove cluster_offset from QCowAIOCB qcow2: remove common from QCowAIOCB qcow2: reindent and use while before the big jump qcow2: removed QCowAIOCB entirely qcow2: remove memory leak qcow2: small math optimization qcow2: small optimization block/qcow.c | 378 ++-- block/qcow2-refcount.c | 16 +-- block/qcow2.c | 412 +++ 3 files changed, 294 insertions(+), 512 deletions(-)
[Qemu-devel] [PATCH v2 02/15] qcow: QCowAIOCB field cleanup
remove unused field from this structure and put some of them in qcow_aio_read_cb and qcow_aio_write_cb Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 137 +++-- 1 files changed, 65 insertions(+), 72 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index d19ef04..f0b1599 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -487,72 +487,61 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, #endif typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; uint8_t *buf; void *orig_buf; int nb_sectors; -int n; -uint64_t cluster_offset; -uint8_t *cluster_data; -struct iovec hd_iov; -bool is_write; -QEMUBH *bh; -QEMUIOVector hd_qiov; -BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int is_write, QCowAIOCB *acb) { -memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; -acb-hd_aiocb = NULL; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; if (qiov-niov 1) { acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); if (is_write) qemu_iovec_to_buffer(qiov, acb-buf); } else { +acb-orig_buf = NULL; acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; -acb-n = 0; -acb-cluster_offset = 0; return acb; } static int qcow_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret; - -acb-hd_aiocb = NULL; +int ret, n = 0; +uint64_t cluster_offset = 0; +struct iovec hd_iov; +QEMUIOVector hd_qiov; redo: /* post process the read buffer */ -if (!acb-cluster_offset) { +if (!cluster_offset) { /* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* nothing to do */ } else { if (s-crypt_method) { encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -acb-n, 0, +n, 0, 
s-aes_decrypt_key); } } -acb-nb_sectors -= acb-n; -acb-sector_num += acb-n; -acb-buf += acb-n * 512; +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; if (acb-nb_sectors == 0) { /* request completed */ @@ -560,57 +549,58 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } /* prepare next AIO request */ -acb-cluster_offset = get_cluster_offset(bs, acb-sector_num 9, +cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -acb-n = s-cluster_sectors - index_in_cluster; -if (acb-n acb-nb_sectors) -acb-n = acb-nb_sectors; +n = s-cluster_sectors - index_in_cluster; +if (n acb-nb_sectors) { +n = acb-nb_sectors; +} -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ -acb-hd_iov.iov_base = (void *)acb-buf; -acb-hd_iov.iov_len = acb-n * 512; -qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1); +hd_iov.iov_base = (void *)acb-buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -acb-n, acb-hd_qiov); +n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return -EIO; } } else { /* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * acb-n); +memset(acb-buf, 0, 512 * n); goto redo; } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, acb-cluster_offset) 0) { +if (decompress_cluster(bs, cluster_offset) 0) { return -EIO; } memcpy(acb-buf, - s-cluster_cache + index_in_cluster * 512, 512 * acb-n); + s-cluster_cache + index_in_cluster * 512, 512 * n); goto redo; } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } -acb-hd_iov.iov_base = (void *)acb-buf
[Qemu-devel] [PATCH v2 03/15] qcow: move some blocks of code to avoid useless variable initialization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 53 ++--- 1 files changed, 26 insertions(+), 27 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index f0b1599..e17f9c5 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n = 0; -uint64_t cluster_offset = 0; +int ret, n; +uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -/* post process the read buffer */ -if (!cluster_offset) { -/* nothing to do */ -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -n, 0, -s-aes_decrypt_key); -} -} - -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ +/* prepare next request */ cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); @@ -573,7 +556,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } else { /* Note: in this case, no need to wait */ memset(acb-buf, 0, 512 * n); -goto redo; } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ @@ -582,7 +564,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } memcpy(acb-buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -goto redo; } else { if ((cluster_offset 511) != 0) { return -EIO; @@ -600,6 +581,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } } +/* post process the read buffer */ +if (!cluster_offset) { +/* nothing to do */ +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* nothing to do */ +} else { +if (s-crypt_method) { +encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, +n, 0, +s-aes_decrypt_key); +} +} + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } @@ -631,16 +629,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) int index_in_cluster; uint64_t cluster_offset; const uint8_t *src_buf; -int ret, n = 0; +int ret, n; uint8_t *cluster_data = NULL; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; @@ -683,6 +677,11 @@ redo: if (ret 0) { return ret; } + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } -- 1.7.1
[Qemu-devel] [PATCH v2 01/15] qcow: allocate QCowAIOCB structure using stack
instead of calling qemi_aio_get use stack Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 52 block/qcow2.c | 38 +++--- 2 files changed, 27 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 6447c2a..d19ef04 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -503,28 +503,12 @@ typedef struct QCowAIOCB { BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; -static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -if (acb-hd_aiocb) -bdrv_aio_cancel(acb-hd_aiocb); -qemu_aio_release(acb); -} - -static AIOPool qcow_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow_aio_cancel, -}; - static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write) +int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-hd_aiocb = NULL; acb-sector_num = sector_num; acb-qiov = qiov; @@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return acb; } -static int qcow_aio_read_cb(void *opaque) +static int qcow_aio_read_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_read_cb(acb); +ret = qcow_aio_read_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_iovec_from_buffer(acb.qiov, 
acb.orig_buf, acb.qiov-size); +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } -static int qcow_aio_write_cb(void *opaque) +static int qcow_aio_write_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; s-cluster_cache_offset = -1; /* disable compressed cache */ -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_write_cb(acb); +ret = qcow_aio_write_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index f07d550..edc068e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -388,17 +388,6 @@ typedef struct QCowAIOCB { QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; -static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -qemu_aio_release(acb); -} - -static AIOPool qcow2_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow2_aio_cancel, -}; - /* * Returns 0 when the request is completed successfully, 1 when there is still * a part left to do and -errno in error cases. 
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write) + void *opaque, int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; acb-is_write = is_write; @@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int
[Qemu-devel] [PATCH v2 15/15] qcow2: small optimization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-refcount.c | 11 +-- 1 files changed, 5 insertions(+), 6 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 0e31868..318d66d 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -681,14 +681,13 @@ void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset, int64_t size) { int refcount; -int64_t start, last, cluster_offset; +int64_t start, last, cluster; uint16_t *p; -start = offset ~(s-cluster_size - 1); -last = (offset + size - 1) ~(s-cluster_size - 1); -for(cluster_offset = start; cluster_offset = last; -cluster_offset += s-cluster_size) { -p = s-refcount_block[cluster_offset s-cluster_bits]; +start = offset s-cluster_bits; +last = (offset + size - 1) s-cluster_bits; +for (cluster = start; cluster = last; ++cluster) { +p = s-refcount_block[cluster]; refcount = be16_to_cpu(*p); refcount++; *p = cpu_to_be16(refcount); -- 1.7.1
[Qemu-devel] [PATCH v2 13/15] qcow2: remove memory leak
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 6073568..aed8da7 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -492,6 +492,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); +qemu_free(cluster_data); return ret; } @@ -604,6 +605,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); +qemu_free(cluster_data); return ret; } -- 1.7.1
[Qemu-devel] [PATCH v2 14/15] qcow2: small math optimization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-refcount.c |5 + 1 files changed, 1 insertions(+), 4 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 14b2f67..0e31868 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -140,10 +140,7 @@ static unsigned int next_refcount_table_size(BDRVQcowState *s, static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, uint64_t offset_b) { -uint64_t block_a = offset_a (2 * s-cluster_bits - REFCOUNT_SHIFT); -uint64_t block_b = offset_b (2 * s-cluster_bits - REFCOUNT_SHIFT); - -return (block_a == block_b); +return !((offset_a ^ offset_b) (2 * s-cluster_bits - REFCOUNT_SHIFT)); } /* -- 1.7.1
[Qemu-devel] [PATCH v2 11/15] qcow2: reindent and use while before the big jump
prepare to remove read/write callbacks Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 272 - 1 files changed, 135 insertions(+), 137 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index c7445cc..dfb969e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -395,107 +395,105 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset = 0; -if (acb-remaining_sectors == 0) { -/* request completed */ -return 0; -} - -/* prepare next request */ -cur_nr_sectors = acb-remaining_sectors; -if (s-crypt_method) { -cur_nr_sectors = MIN(cur_nr_sectors, -QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -} +while (acb-remaining_sectors != 0) { -ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -cur_nr_sectors, cluster_offset); -if (ret 0) { -return ret; -} - -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); - -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, -cur_nr_sectors * 512); - -if (!cluster_offset) { - -if (bs-backing_hd) { -/* read from the base image */ -n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, cur_nr_sectors); -if (n1 0) { -BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); -qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -n1, acb-hd_qiov); -qemu_co_mutex_lock(s-lock); -if (ret 0) { -return ret; -} -} -} else { -/* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +/* prepare next request */ +cur_nr_sectors = acb-remaining_sectors; +if (s-crypt_method) { +cur_nr_sectors = MIN(cur_nr_sectors, +QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* add AIO support for compressed blocks ? 
*/ -ret = qcow2_decompress_cluster(bs, cluster_offset); + +ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, +cur_nr_sectors, cluster_offset); if (ret 0) { return ret; } -qemu_iovec_from_buffer(acb-hd_qiov, -s-cluster_cache + index_in_cluster * 512, -512 * cur_nr_sectors); -} else { -if ((cluster_offset 511) != 0) { -return -EIO; -} +index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -if (s-crypt_method) { -/* - * For encrypted images, read everything into a temporary - * contiguous buffer on which the AES functions can work. - */ -if (!acb-cluster_data) { -acb-cluster_data = -qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); +qemu_iovec_reset(acb-hd_qiov); +qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, +cur_nr_sectors * 512); + +if (!cluster_offset) { + +if (bs-backing_hd) { +/* read from the base image */ +n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, +acb-sector_num, cur_nr_sectors); +if (n1 0) { +BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); +qemu_co_mutex_unlock(s-lock); +ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, +n1, acb-hd_qiov); +qemu_co_mutex_lock(s-lock); +if (ret 0) { +return ret; +} +} +} else { +/* Note: in this case, no need to wait */ +qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +} +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* add AIO support for compressed blocks ? */ +ret = qcow2_decompress_cluster(bs, cluster_offset); +if (ret 0) { +return ret; } -assert(cur_nr_sectors = -QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_add(acb-hd_qiov, acb-cluster_data, +qemu_iovec_from_buffer(acb-hd_qiov, +s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); -} +} else { +if ((cluster_offset 511) != 0) { +return -EIO; +} -BLKDBG_EVENT(bs-file
[Qemu-devel] [PATCH v2 10/15] qcow2: remove common from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c |8 1 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 446946e..c7445cc 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -373,7 +373,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, } typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; int remaining_sectors; @@ -388,7 +388,7 @@ typedef struct QCowAIOCB { */ static int qcow2_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; @@ -504,7 +504,7 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, void *opaque, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; @@ -556,7 +556,7 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) */ static int qcow2_aio_write_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; int n_end; -- 1.7.1
[Qemu-devel] [PATCH v2 06/15] qcow2: removed unused fields
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 10 +++--- 1 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index edc068e..5c454bb 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -381,11 +381,8 @@ typedef struct QCowAIOCB { uint64_t bytes_done; uint64_t cluster_offset; uint8_t *cluster_data; -bool is_write; QEMUIOVector hd_qiov; -QEMUBH *bh; QCowL2Meta l2meta; -QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; /* @@ -517,13 +514,12 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write, QCowAIOCB *acb) + void *opaque, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; qemu_iovec_init(acb-hd_qiov, qiov-niov); @@ -543,7 +539,7 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, QCowAIOCB acb; int ret; -qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0, acb); +qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, acb); qemu_co_mutex_lock(s-lock); do { @@ -658,7 +654,7 @@ static int qcow2_co_writev(BlockDriverState *bs, QCowAIOCB acb; int ret; -qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1, acb); +qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, acb); s-cluster_cache_offset = -1; /* disable compressed cache */ qemu_co_mutex_lock(s-lock); -- 1.7.1
[Qemu-devel] [PATCH v2 07/15] qcow2: removed cur_nr_sectors field in QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 98 + 1 files changed, 43 insertions(+), 55 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 5c454bb..0cf4465 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -377,7 +377,6 @@ typedef struct QCowAIOCB { int64_t sector_num; QEMUIOVector *qiov; int remaining_sectors; -int cur_nr_sectors;/* number of sectors in current iteration */ uint64_t bytes_done; uint64_t cluster_offset; uint8_t *cluster_data; @@ -395,42 +394,22 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; - -/* post process the read buffer */ -if (!acb-cluster_offset) { -/* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -qcow2_encrypt_sectors(s, acb-sector_num, acb-cluster_data, -acb-cluster_data, acb-cur_nr_sectors, 0, s-aes_decrypt_key); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, -acb-cur_nr_sectors * 512); -qemu_iovec_from_buffer(acb-hd_qiov, acb-cluster_data, -512 * acb-cur_nr_sectors); -} -} - -acb-remaining_sectors -= acb-cur_nr_sectors; -acb-sector_num += acb-cur_nr_sectors; -acb-bytes_done += acb-cur_nr_sectors * 512; +int cur_nr_sectors; /* number of sectors in current iteration */ if (acb-remaining_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ -acb-cur_nr_sectors = acb-remaining_sectors; +/* prepare next request */ +cur_nr_sectors = acb-remaining_sectors; if (s-crypt_method) { -acb-cur_nr_sectors = MIN(acb-cur_nr_sectors, +cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -acb-cur_nr_sectors, acb-cluster_offset); +cur_nr_sectors, acb-cluster_offset); if (ret 0) { return ret; } @@ -439,14 +418,14 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_reset(acb-hd_qiov); qemu_iovec_copy(acb-hd_qiov, 
acb-qiov, acb-bytes_done, -acb-cur_nr_sectors * 512); +cur_nr_sectors * 512); if (!acb-cluster_offset) { if (bs-backing_hd) { /* read from the base image */ n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, acb-cur_nr_sectors); +acb-sector_num, cur_nr_sectors); if (n1 0) { BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(s-lock); @@ -457,11 +436,9 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) return ret; } } -return 1; } else { /* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * acb-cur_nr_sectors); -return 1; +qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); } } else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ @@ -472,9 +449,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_from_buffer(acb-hd_qiov, s-cluster_cache + index_in_cluster * 512, -512 * acb-cur_nr_sectors); - -return 1; +512 * cur_nr_sectors); } else { if ((acb-cluster_offset 511) != 0) { return -EIO; @@ -490,24 +465,37 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } -assert(acb-cur_nr_sectors = +assert(cur_nr_sectors = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); qemu_iovec_reset(acb-hd_qiov); qemu_iovec_add(acb-hd_qiov, acb-cluster_data, -512 * acb-cur_nr_sectors); +512 * cur_nr_sectors); } BLKDBG_EVENT(bs-file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-file, (acb-cluster_offset 9) + index_in_cluster, -acb-cur_nr_sectors, acb-hd_qiov); +cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return ret; } +if (s-crypt_method) { +qcow2_encrypt_sectors(s, acb-sector_num, acb-cluster_data, +acb-cluster_data, cur_nr_sectors, 0, s-aes_decrypt_key); +qemu_iovec_reset(acb-hd_qiov); +qemu_iovec_copy(acb-hd_qiov, acb
[Qemu-devel] [PATCH v2 08/15] qcow2: remove l2meta from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 15 --- 1 files changed, 8 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 0cf4465..adf31ce 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -381,7 +381,6 @@ typedef struct QCowAIOCB { uint64_t cluster_offset; uint8_t *cluster_data; QEMUIOVector hd_qiov; -QCowL2Meta l2meta; } QCowAIOCB; /* @@ -514,8 +513,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb-bytes_done = 0; acb-remaining_sectors = nb_sectors; acb-cluster_offset = 0; -acb-l2meta.nb_clusters = 0; -qemu_co_queue_init(acb-l2meta.dependent_requests); return acb; } @@ -566,6 +563,10 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) int n_end; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ +QCowL2Meta l2meta; + +l2meta.nb_clusters = 0; +qemu_co_queue_init(l2meta.dependent_requests); if (acb-remaining_sectors == 0) { /* request completed */ @@ -579,12 +580,12 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) n_end = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors; ret = qcow2_alloc_cluster_offset(bs, acb-sector_num 9, -index_in_cluster, n_end, cur_nr_sectors, acb-l2meta); +index_in_cluster, n_end, cur_nr_sectors, l2meta); if (ret 0) { return ret; } -acb-cluster_offset = acb-l2meta.cluster_offset; +acb-cluster_offset = l2meta.cluster_offset; assert((acb-cluster_offset 511) == 0); qemu_iovec_reset(acb-hd_qiov); @@ -618,9 +619,9 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) return ret; } -ret = qcow2_alloc_cluster_link_l2(bs, acb-l2meta); +ret = qcow2_alloc_cluster_link_l2(bs, l2meta); -run_dependent_requests(s, acb-l2meta); +run_dependent_requests(s, l2meta); if (ret 0) { return ret; -- 1.7.1
[Qemu-devel] [PATCH v2 09/15] qcow2: remove cluster_offset from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 22 +++--- 1 files changed, 11 insertions(+), 11 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index adf31ce..446946e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -378,7 +378,6 @@ typedef struct QCowAIOCB { QEMUIOVector *qiov; int remaining_sectors; uint64_t bytes_done; -uint64_t cluster_offset; uint8_t *cluster_data; QEMUIOVector hd_qiov; } QCowAIOCB; @@ -394,6 +393,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) int index_in_cluster, n1; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ +uint64_t cluster_offset = 0; if (acb-remaining_sectors == 0) { /* request completed */ @@ -408,7 +408,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) } ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -cur_nr_sectors, acb-cluster_offset); +cur_nr_sectors, cluster_offset); if (ret 0) { return ret; } @@ -419,7 +419,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, cur_nr_sectors * 512); -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ @@ -439,9 +439,9 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) /* Note: in this case, no need to wait */ qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ -ret = qcow2_decompress_cluster(bs, acb-cluster_offset); +ret = qcow2_decompress_cluster(bs, cluster_offset); if (ret 0) { return ret; } @@ -450,7 +450,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } @@ -474,7 +474,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) BLKDBG_EVENT(bs-file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-file, -(acb-cluster_offset 9) + index_in_cluster, +(cluster_offset 9) + index_in_cluster, cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { @@ -512,7 +512,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb-bytes_done = 0; acb-remaining_sectors = nb_sectors; -acb-cluster_offset = 0; return acb; } @@ -564,6 +563,7 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) int ret; int cur_nr_sectors; /* number of sectors in current iteration */ QCowL2Meta l2meta; +uint64_t cluster_offset; l2meta.nb_clusters = 0; qemu_co_queue_init(l2meta.dependent_requests); @@ -585,8 +585,8 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) return ret; } -acb-cluster_offset = l2meta.cluster_offset; -assert((acb-cluster_offset 511) == 0); +cluster_offset = l2meta.cluster_offset; +assert((cluster_offset 511) == 0); qemu_iovec_reset(acb-hd_qiov); qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, @@ -612,7 +612,7 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) BLKDBG_EVENT(bs-file, BLKDBG_WRITE_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_writev(bs-file, - (acb-cluster_offset 9) + index_in_cluster, + (cluster_offset 9) + index_in_cluster, cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -- 1.7.1
[Qemu-devel] [PATCH v2 05/15] qcow: remove old #undefined code
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 63 -- 1 files changed, 0 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 00f339f..0989799 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -190,24 +190,6 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) return -1; if (AES_set_decrypt_key(keybuf, 128, s-aes_decrypt_key) != 0) return -1; -#if 0 -/* test */ -{ -uint8_t in[16]; -uint8_t out[16]; -uint8_t tmp[16]; -for(i=0;i16;i++) -in[i] = i; -AES_encrypt(in, tmp, s-aes_encrypt_key); -AES_decrypt(tmp, out, s-aes_decrypt_key); -for(i = 0; i 16; i++) -printf( %02x, tmp[i]); -printf(\n); -for(i = 0; i 16; i++) -printf( %02x, out[i]); -printf(\n); -} -#endif return 0; } @@ -441,51 +423,6 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) return 0; } -#if 0 - -static int qcow_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ -BDRVQcowState *s = bs-opaque; -int ret, index_in_cluster, n; -uint64_t cluster_offset; - -while (nb_sectors 0) { -cluster_offset = get_cluster_offset(bs, sector_num 9, 0, 0, 0, 0); -index_in_cluster = sector_num (s-cluster_sectors - 1); -n = s-cluster_sectors - index_in_cluster; -if (n nb_sectors) -n = nb_sectors; -if (!cluster_offset) { -if (bs-backing_hd) { -/* read from the base image */ -ret = bdrv_read(bs-backing_hd, sector_num, buf, n); -if (ret 0) -return -1; -} else { -memset(buf, 0, 512 * n); -} -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -if (decompress_cluster(bs, cluster_offset) 0) -return -1; -memcpy(buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -} else { -ret = bdrv_pread(bs-file, cluster_offset + index_in_cluster * 512, buf, n * 512); -if (ret != n * 512) -return -1; -if (s-crypt_method) { -encrypt_sectors(s, sector_num, buf, buf, n, 0, -s-aes_decrypt_key); -} -} -nb_sectors -= n; -sector_num += n; -buf += n * 512; -} -return 0; -} -#endif - static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num, int nb_sectors, QEMUIOVector *qiov) { -- 1.7.1
[Qemu-devel] [PATCH v2 12/15] qcow2: removed QCowAIOCB entirely
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 207 ++--- 1 files changed, 80 insertions(+), 127 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index dfb969e..6073568 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -372,83 +372,77 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, return n1; } -typedef struct QCowAIOCB { -BlockDriverState *bs; -int64_t sector_num; -QEMUIOVector *qiov; -int remaining_sectors; -uint64_t bytes_done; -uint8_t *cluster_data; -QEMUIOVector hd_qiov; -} QCowAIOCB; - -/* - * Returns 0 when the request is completed successfully, 1 when there is still - * a part left to do and -errno in error cases. - */ -static int qcow2_aio_read_cb(QCowAIOCB *acb) +static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) { -BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset = 0; +uint64_t bytes_done = 0; +QEMUIOVector hd_qiov; +uint8_t *cluster_data = NULL; -while (acb-remaining_sectors != 0) { +qemu_iovec_init(hd_qiov, qiov-niov); + +qemu_co_mutex_lock(s-lock); + +while (remaining_sectors != 0) { /* prepare next request */ -cur_nr_sectors = acb-remaining_sectors; +cur_nr_sectors = remaining_sectors; if (s-crypt_method) { cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } -ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, +ret = qcow2_get_cluster_offset(bs, sector_num 9, cur_nr_sectors, cluster_offset); if (ret 0) { -return ret; +goto fail; } -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); +index_in_cluster = sector_num (s-cluster_sectors - 1); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, +qemu_iovec_reset(hd_qiov); +qemu_iovec_copy(hd_qiov, qiov, bytes_done, cur_nr_sectors * 512); if (!cluster_offset) { if (bs-backing_hd) { /* 
read from the base image */ -n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, cur_nr_sectors); +n1 = qcow2_backing_read1(bs-backing_hd, hd_qiov, +sector_num, cur_nr_sectors); if (n1 0) { BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -n1, acb-hd_qiov); +ret = bdrv_co_readv(bs-backing_hd, sector_num, +n1, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -return ret; +goto fail; } } } else { /* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +qemu_iovec_memset(hd_qiov, 0, 512 * cur_nr_sectors); } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ ret = qcow2_decompress_cluster(bs, cluster_offset); if (ret 0) { -return ret; +goto fail; } -qemu_iovec_from_buffer(acb-hd_qiov, +qemu_iovec_from_buffer(hd_qiov, s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); } else { if ((cluster_offset 511) != 0) { -return -EIO; +ret = -EIO; +goto fail; } if (s-crypt_method) { @@ -456,15 +450,15 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) * For encrypted images, read everything into a temporary * contiguous buffer on which the AES functions can work. */ -if (!acb-cluster_data) { -acb-cluster_data = +if (!cluster_data) { +cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } assert(cur_nr_sectors = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_add(acb-hd_qiov, acb-cluster_data, +qemu_iovec_reset(hd_qiov
[Qemu-devel] [PATCH v2 04/15] qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 291 - 1 files changed, 123 insertions(+), 168 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index e17f9c5..00f339f 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -486,223 +486,178 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, } #endif -typedef struct QCowAIOCB { -BlockDriverState *bs; -int64_t sector_num; -QEMUIOVector *qiov; -uint8_t *buf; -void *orig_buf; -int nb_sectors; -} QCowAIOCB; - -static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, -int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write, QCowAIOCB *acb) -{ -acb-bs = bs; -acb-sector_num = sector_num; -acb-qiov = qiov; - -if (qiov-niov 1) { -acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); -if (is_write) -qemu_iovec_to_buffer(qiov, acb-buf); -} else { -acb-orig_buf = NULL; -acb-buf = (uint8_t *)qiov-iov-iov_base; -} -acb-nb_sectors = nb_sectors; -return acb; -} - -static int qcow_aio_read_cb(QCowAIOCB *acb) +static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { -BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n; +int ret = 0, n; uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; +uint8_t *buf; +void *orig_buf; - redo: -if (acb-nb_sectors == 0) { -/* request completed */ -return 0; +if (qiov-niov 1) { +buf = orig_buf = qemu_blockalign(bs, qiov-size); +} else { +orig_buf = NULL; +buf = (uint8_t *)qiov-iov-iov_base; } -/* prepare next request */ -cluster_offset = get_cluster_offset(bs, acb-sector_num 9, - 0, 0, 0, 0); -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -n = s-cluster_sectors - index_in_cluster; -if (n acb-nb_sectors) { -n = acb-nb_sectors; -} +qemu_co_mutex_lock(s-lock); + +while (nb_sectors != 0) { +/* prepare next request */ +cluster_offset = get_cluster_offset(bs, sector_num 9, + 0, 0, 0, 0); +index_in_cluster = sector_num (s-cluster_sectors - 1); 
+n = s-cluster_sectors - index_in_cluster; +if (n nb_sectors) { +n = nb_sectors; +} -if (!cluster_offset) { -if (bs-backing_hd) { -/* read from the base image */ -hd_iov.iov_base = (void *)acb-buf; +if (!cluster_offset) { +if (bs-backing_hd) { +/* read from the base image */ +hd_iov.iov_base = (void *)buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); +qemu_co_mutex_unlock(s-lock); +ret = bdrv_co_readv(bs-backing_hd, sector_num, +n, hd_qiov); +qemu_co_mutex_lock(s-lock); +if (ret 0) { +goto fail; +} +} else { +/* Note: in this case, no need to wait */ +memset(buf, 0, 512 * n); +} +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* add AIO support for compressed blocks ? */ +if (decompress_cluster(bs, cluster_offset) 0) { +goto fail; +} +memcpy(buf, + s-cluster_cache + index_in_cluster * 512, 512 * n); +} else { +if ((cluster_offset 511) != 0) { +goto fail; +} +hd_iov.iov_base = (void *)buf; hd_iov.iov_len = n * 512; qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, +ret = bdrv_co_readv(bs-file, +(cluster_offset 9) + index_in_cluster, n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -return -EIO; +break; +} +if (s-crypt_method) { +encrypt_sectors(s, sector_num, buf, buf, +n, 0, +s-aes_decrypt_key); } -} else { -/* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * n); -} -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, cluster_offset) 0) { -return -EIO; -} -memcpy(acb-buf, - s
Re: [Qemu-devel] [PATCH] posix-aio-compat: fix latency issues
2011/8/8 Avi Kivity a...@redhat.com: In certain circumstances, posix-aio-compat can incur a lot of latency: - threads are created by vcpu threads, so if vcpu affinity is set, aio threads inherit vcpu affinity. This can cause many aio threads to compete for one cpu. - we can create up to max_threads (64) aio threads in one go; since a pthread_create can take around 30μs, we have up to 2ms of cpu time under a global lock. Fix by: - moving thread creation to the main thread, so we inherit the main thread's affinity instead of the vcpu thread's affinity. - if a thread is currently being created, and we need to create yet another thread, let thread being born create the new thread, reducing the amount of time we spend under the main thread. - drop the local lock while creating a thread (we may still hold the global mutex, though) Note this doesn't eliminate latency completely; scheduler artifacts or lack of host cpu resources can still cause it. We may want pre-allocated threads when this cannot be tolerated. Thanks to Uli Obergfell of Red Hat for his excellent analysis and suggestions. Signed-off-by: Avi Kivity a...@redhat.com Why not call pthread_attr_setaffinity_np (where available) before thread creation or sched_setaffinity at thread start instead of telling another thread to create a thread for us just to get affinity cleared?
Regards Frediano --- posix-aio-compat.c | 48 ++-- 1 files changed, 46 insertions(+), 2 deletions(-) diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 8dc00cb..aa30673 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -30,6 +30,7 @@ #include block/raw-posix-aio.h +static void do_spawn_thread(void); struct qemu_paiocb { BlockDriverAIOCB common; @@ -64,6 +65,9 @@ static pthread_attr_t attr; static int max_threads = 64; static int cur_threads = 0; static int idle_threads = 0; +static int new_threads = 0; /* backlog of threads we need to create */ +static int pending_threads = 0; /* threads created but not running yet */ +static QEMUBH *new_thread_bh; static QTAILQ_HEAD(, qemu_paiocb) request_list; #ifdef CONFIG_PREADV @@ -311,6 +315,13 @@ static void *aio_thread(void *unused) pid = getpid(); + mutex_lock(lock); + if (new_threads) { + do_spawn_thread(); + } + pending_threads--; + mutex_unlock(lock); + while (1) { struct qemu_paiocb *aiocb; ssize_t ret = 0; @@ -381,11 +392,18 @@ static void *aio_thread(void *unused) return NULL; } -static void spawn_thread(void) +static void do_spawn_thread(void) { sigset_t set, oldset; - cur_threads++; + if (!new_threads) { + return; + } + + new_threads--; + pending_threads++; + + mutex_unlock(lock); /* block all signals */ if (sigfillset(set)) die(sigfillset); @@ -394,6 +412,31 @@ static void spawn_thread(void) thread_create(thread_id, attr, aio_thread, NULL); if (sigprocmask(SIG_SETMASK, oldset, NULL)) die(sigprocmask restore); + + mutex_lock(lock); +} + +static void spawn_thread_bh_fn(void *opaque) +{ + mutex_lock(lock); + do_spawn_thread(); + mutex_unlock(lock); +} + +static void spawn_thread(void) +{ + cur_threads++; + new_threads++; + /* If there are threads being created, they will spawn new workers, so + * we don't spend time creating many threads in a loop holding a mutex or + * starving the current vcpu. 
+ * + * If there are no idle threads, ask the main thread to create one, so we + * inherit the correct affinity instead of the vcpu affinity. + */ + if (!pending_threads) { + qemu_bh_schedule(new_thread_bh); + } } static void qemu_paio_submit(struct qemu_paiocb *aiocb) @@ -665,6 +708,7 @@ int paio_init(void) die2(ret, pthread_attr_setdetachstate); QTAILQ_INIT(request_list); + new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL); posix_aio_state = s; return 0; -- 1.7.5.3
Re: [Qemu-devel] [PATCH] posix-aio-compat: fix latency issues
2011/8/8 Avi Kivity a...@redhat.com: On 08/08/2011 03:49 PM, Frediano Ziglio wrote: 2011/8/8 Avi Kivitya...@redhat.com: In certain circumstances, posix-aio-compat can incur a lot of latency: - threads are created by vcpu threads, so if vcpu affinity is set, aio threads inherit vcpu affinity. This can cause many aio threads to compete for one cpu. - we can create up to max_threads (64) aio threads in one go; since a pthread_create can take around 30μs, we have up to 2ms of cpu time under a global lock. Fix by: - moving thread creation to the main thread, so we inherit the main thread's affinity instead of the vcpu thread's affinity. - if a thread is currently being created, and we need to create yet another thread, let thread being born create the new thread, reducing the amount of time we spend under the main thread. - drop the local lock while creating a thread (we may still hold the global mutex, though) Note this doesn't eliminate latency completely; scheduler artifacts or lack of host cpu resources can still cause it. We may want pre-allocated threads when this cannot be tolerated. Thanks to Uli Obergfell of Red Hat for his excellent analysis and suggestions. Signed-off-by: Avi Kivitya...@redhat.com Why not call pthread_attr_setaffinity_np (where available) before thread creation or sched_setaffinity at thread start instead of telling another thread to create a thread for us just to get affinity cleared? The entire qemu process may be affined to a subset of the host cpus; we don't want to break that. For example: taskset 0xf0 qemu (qemu) info cpus pin individual vcpu threads to host cpus Just call sched_getaffinity at program start, save to a global variable and then set this affinity for io threads. I didn't use affinity that much but from the manual it seems that if you own the process you can set affinity as you like. IMHO this patch introduces a delay in io thread creation due to posting thread creation to another thread just to set different affinity. Frediano
Re: [Qemu-devel] [PATCH v2 0.15.0] qcow2: Fix L1 table size after bdrv_snapshot_goto
2011/8/4 Kevin Wolf kw...@redhat.com: When loading an internal snapshot whose L1 table is smaller than the current L1 table, the size of the current L1 would be shrunk to the snapshot's L1 size in memory, but not on disk. This lead to incorrect refcount updates and eventuelly to image corruption. Instead of writing the new L1 size to disk, this simply retains the bigger L1 size that is currently in use and makes sure that the unused part is zeroed. Signed-off-by: Kevin Wolf kw...@redhat.com --- And the moment you send it out, you notice that it's wrong... *sigh* v2: - Check for s-l1_size sn-l1_size in order to avoid disasters... Philipp, I think this should fix your corruption. Please give it a try. Anthony, this must go into 0.15. Given the short time until -rc2, do you prefer to pick it up directly or should I send a pull request tomorrow? The patch looks obvious, is tested with the given testcase and survives a basic qemu-iotests run (though qemu-iotests doesn't exercise snapshots a lot) Stefan, please review :-) block/qcow2-snapshot.c | 5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 74823a5..6972e66 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -330,8 +330,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) if (qcow2_grow_l1_table(bs, sn-l1_size, true) 0) goto fail; - s-l1_size = sn-l1_size; + if (s-l1_size sn-l1_size) { + memset(s-l1_table + sn-l1_size, 0, s-l1_size - sn-l1_size); + } l1_size2 = s-l1_size * sizeof(uint64_t); + /* copy the snapshot l1 table to the current l1 table */ if (bdrv_pread(bs-file, sn-l1_table_offset, s-l1_table, l1_size2) != l1_size2) -- 1.7.6 This patch looked odd at first sight. First a qcow2_grow_l1_table is called to shrink L1 so perhaps should be qcow2_resize_l1_table. 
Perhaps it would also be better to clean entries in qcow2_grow_l1_table instead of qcow2_snapshot_goto, to avoid the same problem in different calls to qcow2_grow_l1_table. The other oddity (still to be understood) is: why does some code use l1_table above l1_size? Frediano
[Qemu-devel] Possible corruption on qcow2_snapshot_goto
This function seems to be not that safe. The first problem is that the first thing it does is free the refcounts, calling qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1). Now if something goes wrong (crash, disk errors) this could lead to refcount == 0 (depending on L1 size, cache, open flags). If the table grows and qcow2_grow_l1_table needs to resize on disk, it will probably find some reference counter set to 0 (because we already decremented it) and fail; then qcow2_snapshot_goto returns -EIO, leaving the refcounts deallocated. Frediano
Re: [Qemu-devel] [PATCH v2 0.15.0] qcow2: Fix L1 table size after bdrv_snapshot_goto
2011/8/5 Kevin Wolf kw...@redhat.com: Am 05.08.2011 08:35, schrieb Frediano Ziglio: 2011/8/4 Kevin Wolf kw...@redhat.com: When loading an internal snapshot whose L1 table is smaller than the current L1 table, the size of the current L1 would be shrunk to the snapshot's L1 size in memory, but not on disk. This lead to incorrect refcount updates and eventuelly to image corruption. Instead of writing the new L1 size to disk, this simply retains the bigger L1 size that is currently in use and makes sure that the unused part is zeroed. Signed-off-by: Kevin Wolf kw...@redhat.com --- And the moment you send it out, you notice that it's wrong... *sigh* v2: - Check for s-l1_size sn-l1_size in order to avoid disasters... Philipp, I think this should fix your corruption. Please give it a try. Anthony, this must go into 0.15. Given the short time until -rc2, do you prefer to pick it up directly or should I send a pull request tomorrow? The patch looks obvious, is tested with the given testcase and survives a basic qemu-iotests run (though qemu-iotests doesn't exercise snapshots a lot) Stefan, please review :-) block/qcow2-snapshot.c | 5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 74823a5..6972e66 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -330,8 +330,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) if (qcow2_grow_l1_table(bs, sn-l1_size, true) 0) goto fail; - s-l1_size = sn-l1_size; + if (s-l1_size sn-l1_size) { + memset(s-l1_table + sn-l1_size, 0, s-l1_size - sn-l1_size); + } l1_size2 = s-l1_size * sizeof(uint64_t); + /* copy the snapshot l1 table to the current l1 table */ if (bdrv_pread(bs-file, sn-l1_table_offset, s-l1_table, l1_size2) != l1_size2) -- 1.7.6 This patch looked odd at first sight. First a qcow2_grow_l1_table is called to shrink L1 so perhaps should be qcow2_resize_l1_table. 
No, it doesn't shrink the table: if (min_size = s-l1_size) return 0; Yes, but perhaps returning success and not clipping anything is not that correct, perhaps qcow2_snapshot_goto should not call qcow2_grow_l1_table with a shorter value. Perhaps also it would be better to clean entries in qcow2_grow_l1_table instead of qcow2_snapshot_goto to avoid same problem in different calls to qcow2_grow_l1_table. The other oddity (still to understand) is: why does some code use l1_table above l1_size ?? Which code do you mean specifically? Kevin I think this is the issue # 1204 - 128 cluster per L2 entry - 128k per L2 entry # 128 cluster per L1 entry - 16M per L1 entry qemu-img create -f qcow2 /tmp/sn.qcow2 16m -o cluster_size=1024 qemu-img snapshot -c foo /tmp/sn.qcow2 # extend L1 qemu-io -c 'write -b 0 4M' /tmp/sn.qcow2 # shrink qemu-img snapshot -a foo /tmp/sn.qcow2 qemu-img check /tmp/sn.qcow2 Well... I was trying to get a leak and got a core instead. I removed your patch and got leaks. also, should not be memset(s-l1_table + sn-l1_size, 0, (s-l1_size - sn-l1_size) * sizeof(uint64_t)); instead of memset(s-l1_table + sn-l1_size, 0, s-l1_size - sn-l1_size); Frediano
Re: [Qemu-devel] [PATCH v2 0.15.0] qcow2: Fix L1 table size after bdrv_snapshot_goto
2011/8/5 Frediano Ziglio fredd...@gmail.com: 2011/8/5 Kevin Wolf kw...@redhat.com: Am 05.08.2011 08:35, schrieb Frediano Ziglio: 2011/8/4 Kevin Wolf kw...@redhat.com: When loading an internal snapshot whose L1 table is smaller than the current L1 table, the size of the current L1 would be shrunk to the snapshot's L1 size in memory, but not on disk. This lead to incorrect refcount updates and eventuelly to image corruption. Instead of writing the new L1 size to disk, this simply retains the bigger L1 size that is currently in use and makes sure that the unused part is zeroed. Signed-off-by: Kevin Wolf kw...@redhat.com --- And the moment you send it out, you notice that it's wrong... *sigh* v2: - Check for s-l1_size sn-l1_size in order to avoid disasters... Philipp, I think this should fix your corruption. Please give it a try. Anthony, this must go into 0.15. Given the short time until -rc2, do you prefer to pick it up directly or should I send a pull request tomorrow? The patch looks obvious, is tested with the given testcase and survives a basic qemu-iotests run (though qemu-iotests doesn't exercise snapshots a lot) Stefan, please review :-) block/qcow2-snapshot.c | 5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 74823a5..6972e66 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -330,8 +330,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) if (qcow2_grow_l1_table(bs, sn-l1_size, true) 0) goto fail; - s-l1_size = sn-l1_size; + if (s-l1_size sn-l1_size) { + memset(s-l1_table + sn-l1_size, 0, s-l1_size - sn-l1_size); + } l1_size2 = s-l1_size * sizeof(uint64_t); + /* copy the snapshot l1 table to the current l1 table */ if (bdrv_pread(bs-file, sn-l1_table_offset, s-l1_table, l1_size2) != l1_size2) -- 1.7.6 This patch looked odd at first sight. First a qcow2_grow_l1_table is called to shrink L1 so perhaps should be qcow2_resize_l1_table. 
No, it doesn't shrink the table: if (min_size = s-l1_size) return 0; Yes, but perhaps returning success and not clipping anything is not that correct, perhaps qcow2_snapshot_goto should not call qcow2_grow_l1_table with a shorter value. Perhaps also it would be better to clean entries in qcow2_grow_l1_table instead of qcow2_snapshot_goto to avoid same problem in different calls to qcow2_grow_l1_table. The other oddity (still to understand) is: why does some code use l1_table above l1_size ?? Which code do you mean specifically? Kevin I think this is the issue # 1204 - 128 cluster per L2 entry - 128k per L2 entry # 128 cluster per L1 entry - 16M per L1 entry qemu-img create -f qcow2 /tmp/sn.qcow2 16m -o cluster_size=1024 qemu-img snapshot -c foo /tmp/sn.qcow2 # extend L1 qemu-io -c 'write -b 0 4M' /tmp/sn.qcow2 # shrink qemu-img snapshot -a foo /tmp/sn.qcow2 qemu-img check /tmp/sn.qcow2 Well... I was trying to get a leak and got a core instead. I removed your patch and got leaks. also, should not be memset(s-l1_table + sn-l1_size, 0, (s-l1_size - sn-l1_size) * sizeof(uint64_t)); instead of memset(s-l1_table + sn-l1_size, 0, s-l1_size - sn-l1_size); Frediano This patch fix Philipp and my problem diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 74823a5..0e5fc13 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -330,14 +330,16 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) if (qcow2_grow_l1_table(bs, sn-l1_size, true) 0) goto fail; -s-l1_size = sn-l1_size; -l1_size2 = s-l1_size * sizeof(uint64_t); +if (s-l1_size sn-l1_size) { +memset(s-l1_table + sn-l1_size, 0, (s-l1_size - sn-l1_size) * sizeof(uint64_t)); +} +l1_size2 = sn-l1_size * sizeof(uint64_t); /* copy the snapshot l1 table to the current l1 table */ if (bdrv_pread(bs-file, sn-l1_table_offset, s-l1_table, l1_size2) != l1_size2) goto fail; if (bdrv_pwrite_sync(bs-file, s-l1_table_offset, -s-l1_table, l1_size2) 0) +s-l1_table, s-l1_size * 
sizeof(uint64_t)) < 0) goto fail; for(i = 0; i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); Frediano
Re: [Qemu-devel] qcow2x
2011/8/2 Kevin Wolf kw...@redhat.com: Am 02.08.2011 17:29, schrieb Frediano Ziglio: - L2 allocation can be done with relative data (this is not easy to do with current code) What do you mean by that? Let's take an example. By allocation I mean give a position to data/l2/refcount_table. Usually you cannot update/write L2 before data is allocated and written if you don't want to get a L2 entry pointing to garbage or unwritten data (if physically you write to a sector you get new data or old one on fail, not data changing to anything else). The exception to this is when even L2 table is not allocated. In this case you can write L2 table with data cause in case of failure this new L2 table is just not attached to anything (cause L1 still point to old L2 or is not allocated). My patch can collapse these two cluster writes in a single one. The key point of the patch is mainly collapsing all writes as you can not blocking other writes if not needed. Ok, I see. Not sure if it makes any measurable difference. With 64k clusters, an L2 allocation happens only every 512 MB. But if it's not much code to support it, it's probably a good thing. Let's see :) I published a repo at gitorious. https://gitorious.org/qemu (yes, qemu was free for me :) ) Currently there are 2 branches, qcow2x and my_cleanup. I think gitk interface is much better. From my_cleanup branch from qcow: allocate QCowAIOCB structure using stack to qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev there are cleanups for qcow while from qcow2: removed unused fields to qcow2: remove memory leak there are cleanups for qcow2. ... omissis ... Works for me. Just anything that I can pull from. Are you in the #qemu IRC channel? I think we should coordinate our qcow2 work a bit in order to avoid conflicting or duplicate work. No, I don't use irc that much (time shift problems and also connection too). When are you online? Usually until something like 18:00 CEST (16:00 UTC). 
Kevin :( me from 18:00 CEST Frediano
[Qemu-devel] qcow2x
Hi, I spent some time trying to find a way to speed up qcow2 performance on allocation and snapshot so I branch from kevin/coroutine-block branch a qcow2x branch. Currently it just write using different algorithm (that is is fully compatible with qcow2, there is not ICountAsZero(TM) method :) ). Mainly is a complete rewrite of qcow2_co_writev. Some problems I encountered in the way current qcow2 code works - reference decrement are not optimized (well, this is easy to fix on current code) - any L2 update blocks all other L2 updates, this is a problem if guest is writing large file sequentially cause most write needs to be serialized - L2 allocation can be done with relative data (this is not easy to do with current code) - data/l2 are allocated sequentially (if there are not hole freed) but written in another order. This cause excessive file fragmentation with default cache mode, for instance on xfs file is allocated quite sequentially on every write so any no-sequential write create a different fragment. Currently I'm getting these times with iotests (my_cleanup branch is another branch more conservative with a patch to collapse reference decrement, note that 011 and 026 are missing, still not working) XCB 001 637 002 334 003 333 004 010 005 000 007 35 32 36 008 343 009 100 010 000 012 002 013 125 err 158 014 189 err 203 015 48 70 610 017 444 018 555 019 444 020 444 021 000 022 74 103 103 023 75 err 95 024 333 025 336 027 110 028 111 X qcow2x C my_cleanup B kevin/coroutine-block Currently code is quite spaghetti code (needs a lot of cleanup, checks, better error handling and so on). Taking into account that code require additional optimizations and is full of internal debugging time times are quite good. Main questions are: - are somebody interesting ? - how can I send such a patch for review considering that is quite big (I know, I have to clean a bit too) ? Regards, Frediano
Re: [Qemu-devel] [PATCH 00/10] block: Coroutine support
2011/8/2 Kevin Wolf kw...@redhat.com: Am 02.08.2011 16:23, schrieb Avi Kivity: On 07/26/2011 02:48 PM, Kevin Wolf wrote: Depends on Stefan's latest coroutine patches. This series makes qcow and qcow2 take advantage of the new coroutine infrastructure. Both formats used synchronous operations for accessing their metadata and blocked the guest CPU during that time. With coroutines, the I/O will happen asynchronously in the background and the CPU won't be blocked any more. Do you plan to convert qcow2 to a fully synchronous design? IMO that will make it more maintainable. Cancellation will need some thought, though. After this patch series, all interesting paths are free of callbacks (I assume this is what you mean by synchronous?). The only thing I can see that is left is qcow2_aio_flush. What is required are some cleanups that eliminate things that still look like AIO code, and yes, that's something that I want to have. Frediano has posted some patches which I haven't fully reviewed yet, but the qcow1 RFC he posted was definitely a step in the right direction. Did I send patches for qcow2? I just rebased them with your last updates, I'll send them again. Regarding cancellation, I don't know any driver that really does what it's supposed to do. There are basically two ways of implementing it in current code: Either by completing the request instead of cancelling, or it's broken. I'd suggest that we implement waiting for completion as a generic function in the block layer and be done with it (actually this is what happens with bdrv_aio_co_cancel_em, it just could be a bit finer grained). Kevin Frediano
Re: [Qemu-devel] qcow2x
2011/8/2 Kevin Wolf kw...@redhat.com: Am 02.08.2011 16:30, schrieb Frediano Ziglio: Hi, I spent some time trying to find a way to speed up qcow2 performance on allocation and snapshot so I branch from kevin/coroutine-block branch a qcow2x branch. Currently it just write using different algorithm (that is is fully compatible with qcow2, there is not ICountAsZero(TM) method :) ). Mainly is a complete rewrite of qcow2_co_writev. Some problems I encountered in the way current qcow2 code works Do you have a public git repo of your code? - reference decrement are not optimized (well, this is easy to fix on current code) - any L2 update blocks all other L2 updates, this is a problem if guest is writing large file sequentially cause most write needs to be serialized Yes, I am well aware of that. In my old coroutine-devel branch (it's still online) I started pushing the locks down and parallelising things this way. The code that is there is broken because the locking isn't completely right (L2 allocation vs. cache users of that L2, and somthing with refcounts). The changes to the cache should be about right, though. - L2 allocation can be done with relative data (this is not easy to do with current code) What do you mean by that? Let's take an example. By allocation I mean give a position to data/l2/refcount_table. Usually you cannot update/write L2 before data is allocated and written if you don't want to get a L2 entry pointing to garbage or unwritten data (if physically you write to a sector you get new data or old one on fail, not data changing to anything else). The exception to this is when even L2 table is not allocated. In this case you can write L2 table with data cause in case of failure this new L2 table is just not attached to anything (cause L1 still point to old L2 or is not allocated). My patch can collapse these two cluster writes in a single one. The key point of the patch is mainly collapsing all writes as you can not blocking other writes if not needed. 
- data/l2 are allocated sequentially (if there are not hole freed) but written in another order. This cause excessive file fragmentation with default cache mode, for instance on xfs file is allocated quite sequentially on every write so any no-sequential write create a different fragment. Currently I'm getting these times with iotests (my_cleanup branch is another branch more conservative with a patch to collapse reference decrement, note that 011 and 026 are missing, still not working) Note that qemu-iotests is often a good indicator, but the tools often show different behaviour from real guests, so you should also run benchmarks in a VM. I know, one reason is that guest usually do a lot of small write/read (probably this is how hardware work but I don't know this side that much, usually I didn't see request longer than 128 sectors). X C B 001 6 3 7 002 3 3 4 003 3 3 3 004 0 1 0 005 0 0 0 007 35 32 36 008 3 4 3 009 1 0 0 010 0 0 0 012 0 0 2 013 125 err 158 014 189 err 203 015 48 70 610 017 4 4 4 018 5 5 5 019 4 4 4 020 4 4 4 021 0 0 0 022 74 103 103 023 75 err 95 024 3 3 3 025 3 3 6 027 1 1 0 028 1 1 1 X qcow2x C my_cleanup B kevin/coroutine-block Currently code is quite spaghetti code (needs a lot of cleanup, checks, better error handling and so on). Taking into account that code require additional optimizations and is full of internal debugging time times are quite good. Main questions are: - are somebody interesting ? - how can I send such a patch for review considering that is quite big (I know, I have to clean a bit too) ? You'll need to split it up into reviewable pieces. But let me have a look at your git tree first. I have an account on gitorious but still my repo is only local. Do you suggest a different provider or gitorious is ok for you? Are you in the #qemu IRC channel? I think we should coordinate our qcow2 work a bit in order to avoid conflicting or duplicate work. Kevin No, I don't use irc that much (time shift problems and also connection too). 
When are you online? Frediano
Re: [Qemu-devel] Volume key in qcow3?
2011/7/28 Kevin Wolf kw...@redhat.com: Am 28.07.2011 10:05, schrieb Frediano Ziglio: Hi, I noted that AES encryption using qcow2 just use the password given as as key (and also truncating it to 16 bytes == 128 bits). This is prone to brute force attacks and is not also easy to change password (you have to decrypt and encrypt again the entire image). LUKS and EncFS use another way. They generate a random key (the volume key) then use the password you give to encrypt N times (where N is decided by security level or automatically based on time to decrypt the volume key. To change the password just give the old one, get the volume key and encrypt again using the new one. LUKS support also multiple slots to allow multiple password and even using an external key file. Obviously this require an additional extension to qcow2 so I think it require a new qcow3 format. Yes, once we have qcow3, adding things like this should be easy enough. I think the idea makes sense. Another thing to consider with encryption is that we don't encrypt metadata currently. I'm not entirely sure if this is a good or a bad thing. Metadata is relatively predictable and I think that might hurt the encryption? Though I'm really not an expert in this area. Kevin I don't think this is a big issue, usually sensitive data is not in knowing which clusters are allocated or in which sequence (perhaps looking at allocation strategy you can detect operating system and filesystem type), snapshot ids and descriptions are IMO more sensible. Do anybody though that qcow2 can support data-deduplication using refcounts? Well, in this case this is not possible with encrypted data. Frediano
[Qemu-devel] [PATCH 0/8] qcow2 cleanup for coroutine-block branch
This is a collection of patches that remove QCowAIOCB, which is useless with coroutines. It applies to Kevin's coroutine-block branch. I left all the steps I did to remove the structure; feel free to collapse some of them. I tested with iotests and got no changes (026 fails like the coroutine-block branch). Frediano Ziglio (8): qcow2: removed unused fields qcow2: removed cur_nr_sectors field in QCowAIOCB qcow2: remove l2meta from QCowAIOCB qcow2: remove cluster_offset from QCowAIOCB qcow2: remove common from QCowAIOCB qcow2: reindent and use while before the big jump qcow2: removed QCowAIOCB entirely qcow2: remove memory leak block/qcow2.c | 396 - 1 files changed, 167 insertions(+), 229 deletions(-)
[Qemu-devel] [PATCH 1/8] qcow2: removed unused fields
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 10 +++--- 1 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index edc068e..5c454bb 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -381,11 +381,8 @@ typedef struct QCowAIOCB { uint64_t bytes_done; uint64_t cluster_offset; uint8_t *cluster_data; -bool is_write; QEMUIOVector hd_qiov; -QEMUBH *bh; QCowL2Meta l2meta; -QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; /* @@ -517,13 +514,12 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write, QCowAIOCB *acb) + void *opaque, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; qemu_iovec_init(acb-hd_qiov, qiov-niov); @@ -543,7 +539,7 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, QCowAIOCB acb; int ret; -qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0, acb); +qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, acb); qemu_co_mutex_lock(s-lock); do { @@ -658,7 +654,7 @@ static int qcow2_co_writev(BlockDriverState *bs, QCowAIOCB acb; int ret; -qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1, acb); +qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, acb); s-cluster_cache_offset = -1; /* disable compressed cache */ qemu_co_mutex_lock(s-lock); -- 1.7.1
[Qemu-devel] [PATCH 2/8] qcow2: removed cur_nr_sectors field in QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 98 + 1 files changed, 43 insertions(+), 55 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 5c454bb..0cf4465 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -377,7 +377,6 @@ typedef struct QCowAIOCB { int64_t sector_num; QEMUIOVector *qiov; int remaining_sectors; -int cur_nr_sectors;/* number of sectors in current iteration */ uint64_t bytes_done; uint64_t cluster_offset; uint8_t *cluster_data; @@ -395,42 +394,22 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; - -/* post process the read buffer */ -if (!acb-cluster_offset) { -/* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -qcow2_encrypt_sectors(s, acb-sector_num, acb-cluster_data, -acb-cluster_data, acb-cur_nr_sectors, 0, s-aes_decrypt_key); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, -acb-cur_nr_sectors * 512); -qemu_iovec_from_buffer(acb-hd_qiov, acb-cluster_data, -512 * acb-cur_nr_sectors); -} -} - -acb-remaining_sectors -= acb-cur_nr_sectors; -acb-sector_num += acb-cur_nr_sectors; -acb-bytes_done += acb-cur_nr_sectors * 512; +int cur_nr_sectors; /* number of sectors in current iteration */ if (acb-remaining_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ -acb-cur_nr_sectors = acb-remaining_sectors; +/* prepare next request */ +cur_nr_sectors = acb-remaining_sectors; if (s-crypt_method) { -acb-cur_nr_sectors = MIN(acb-cur_nr_sectors, +cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -acb-cur_nr_sectors, acb-cluster_offset); +cur_nr_sectors, acb-cluster_offset); if (ret 0) { return ret; } @@ -439,14 +418,14 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_reset(acb-hd_qiov); qemu_iovec_copy(acb-hd_qiov, 
acb-qiov, acb-bytes_done, -acb-cur_nr_sectors * 512); +cur_nr_sectors * 512); if (!acb-cluster_offset) { if (bs-backing_hd) { /* read from the base image */ n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, acb-cur_nr_sectors); +acb-sector_num, cur_nr_sectors); if (n1 0) { BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(s-lock); @@ -457,11 +436,9 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) return ret; } } -return 1; } else { /* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * acb-cur_nr_sectors); -return 1; +qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); } } else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ @@ -472,9 +449,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_from_buffer(acb-hd_qiov, s-cluster_cache + index_in_cluster * 512, -512 * acb-cur_nr_sectors); - -return 1; +512 * cur_nr_sectors); } else { if ((acb-cluster_offset 511) != 0) { return -EIO; @@ -490,24 +465,37 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } -assert(acb-cur_nr_sectors = +assert(cur_nr_sectors = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); qemu_iovec_reset(acb-hd_qiov); qemu_iovec_add(acb-hd_qiov, acb-cluster_data, -512 * acb-cur_nr_sectors); +512 * cur_nr_sectors); } BLKDBG_EVENT(bs-file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-file, (acb-cluster_offset 9) + index_in_cluster, -acb-cur_nr_sectors, acb-hd_qiov); +cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return ret; } +if (s-crypt_method) { +qcow2_encrypt_sectors(s, acb-sector_num, acb-cluster_data, +acb-cluster_data, cur_nr_sectors, 0, s-aes_decrypt_key); +qemu_iovec_reset(acb-hd_qiov); +qemu_iovec_copy(acb-hd_qiov, acb
[Qemu-devel] [PATCH 3/8] qcow2: remove l2meta from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 15 --- 1 files changed, 8 insertions(+), 7 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 0cf4465..adf31ce 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -381,7 +381,6 @@ typedef struct QCowAIOCB { uint64_t cluster_offset; uint8_t *cluster_data; QEMUIOVector hd_qiov; -QCowL2Meta l2meta; } QCowAIOCB; /* @@ -514,8 +513,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb-bytes_done = 0; acb-remaining_sectors = nb_sectors; acb-cluster_offset = 0; -acb-l2meta.nb_clusters = 0; -qemu_co_queue_init(acb-l2meta.dependent_requests); return acb; } @@ -566,6 +563,10 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) int n_end; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ +QCowL2Meta l2meta; + +l2meta.nb_clusters = 0; +qemu_co_queue_init(l2meta.dependent_requests); if (acb-remaining_sectors == 0) { /* request completed */ @@ -579,12 +580,12 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) n_end = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors; ret = qcow2_alloc_cluster_offset(bs, acb-sector_num 9, -index_in_cluster, n_end, cur_nr_sectors, acb-l2meta); +index_in_cluster, n_end, cur_nr_sectors, l2meta); if (ret 0) { return ret; } -acb-cluster_offset = acb-l2meta.cluster_offset; +acb-cluster_offset = l2meta.cluster_offset; assert((acb-cluster_offset 511) == 0); qemu_iovec_reset(acb-hd_qiov); @@ -618,9 +619,9 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) return ret; } -ret = qcow2_alloc_cluster_link_l2(bs, acb-l2meta); +ret = qcow2_alloc_cluster_link_l2(bs, l2meta); -run_dependent_requests(s, acb-l2meta); +run_dependent_requests(s, l2meta); if (ret 0) { return ret; -- 1.7.1
[Qemu-devel] [PATCH 8/8] qcow2: remove memory leak
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 6073568..aed8da7 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -492,6 +492,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); +qemu_free(cluster_data); return ret; } @@ -604,6 +605,7 @@ fail: qemu_co_mutex_unlock(s-lock); qemu_iovec_destroy(hd_qiov); +qemu_free(cluster_data); return ret; } -- 1.7.1
[Qemu-devel] [PATCH 7/8] qcow2: removed QCowAIOCB entirely
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 207 ++--- 1 files changed, 80 insertions(+), 127 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index dfb969e..6073568 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -372,83 +372,77 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, return n1; } -typedef struct QCowAIOCB { -BlockDriverState *bs; -int64_t sector_num; -QEMUIOVector *qiov; -int remaining_sectors; -uint64_t bytes_done; -uint8_t *cluster_data; -QEMUIOVector hd_qiov; -} QCowAIOCB; - -/* - * Returns 0 when the request is completed successfully, 1 when there is still - * a part left to do and -errno in error cases. - */ -static int qcow2_aio_read_cb(QCowAIOCB *acb) +static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) { -BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset = 0; +uint64_t bytes_done = 0; +QEMUIOVector hd_qiov; +uint8_t *cluster_data = NULL; -while (acb-remaining_sectors != 0) { +qemu_iovec_init(hd_qiov, qiov-niov); + +qemu_co_mutex_lock(s-lock); + +while (remaining_sectors != 0) { /* prepare next request */ -cur_nr_sectors = acb-remaining_sectors; +cur_nr_sectors = remaining_sectors; if (s-crypt_method) { cur_nr_sectors = MIN(cur_nr_sectors, QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } -ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, +ret = qcow2_get_cluster_offset(bs, sector_num 9, cur_nr_sectors, cluster_offset); if (ret 0) { -return ret; +goto fail; } -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); +index_in_cluster = sector_num (s-cluster_sectors - 1); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, +qemu_iovec_reset(hd_qiov); +qemu_iovec_copy(hd_qiov, qiov, bytes_done, cur_nr_sectors * 512); if (!cluster_offset) { if (bs-backing_hd) { /* 
read from the base image */ -n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, cur_nr_sectors); +n1 = qcow2_backing_read1(bs-backing_hd, hd_qiov, +sector_num, cur_nr_sectors); if (n1 0) { BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -n1, acb-hd_qiov); +ret = bdrv_co_readv(bs-backing_hd, sector_num, +n1, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -return ret; +goto fail; } } } else { /* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +qemu_iovec_memset(hd_qiov, 0, 512 * cur_nr_sectors); } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ ret = qcow2_decompress_cluster(bs, cluster_offset); if (ret 0) { -return ret; +goto fail; } -qemu_iovec_from_buffer(acb-hd_qiov, +qemu_iovec_from_buffer(hd_qiov, s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); } else { if ((cluster_offset 511) != 0) { -return -EIO; +ret = -EIO; +goto fail; } if (s-crypt_method) { @@ -456,15 +450,15 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) * For encrypted images, read everything into a temporary * contiguous buffer on which the AES functions can work. */ -if (!acb-cluster_data) { -acb-cluster_data = +if (!cluster_data) { +cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); } assert(cur_nr_sectors = QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_add(acb-hd_qiov, acb-cluster_data, +qemu_iovec_reset(hd_qiov
[Qemu-devel] [PATCH 4/8] qcow2: remove cluster_offset from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 22 +++--- 1 files changed, 11 insertions(+), 11 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index adf31ce..446946e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -378,7 +378,6 @@ typedef struct QCowAIOCB { QEMUIOVector *qiov; int remaining_sectors; uint64_t bytes_done; -uint64_t cluster_offset; uint8_t *cluster_data; QEMUIOVector hd_qiov; } QCowAIOCB; @@ -394,6 +393,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) int index_in_cluster, n1; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ +uint64_t cluster_offset = 0; if (acb-remaining_sectors == 0) { /* request completed */ @@ -408,7 +408,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) } ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -cur_nr_sectors, acb-cluster_offset); +cur_nr_sectors, cluster_offset); if (ret 0) { return ret; } @@ -419,7 +419,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, cur_nr_sectors * 512); -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ @@ -439,9 +439,9 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) /* Note: in this case, no need to wait */ qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ -ret = qcow2_decompress_cluster(bs, acb-cluster_offset); +ret = qcow2_decompress_cluster(bs, cluster_offset); if (ret 0) { return ret; } @@ -450,7 +450,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } @@ -474,7 +474,7 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) BLKDBG_EVENT(bs-file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-file, -(acb-cluster_offset 9) + index_in_cluster, +(cluster_offset 9) + index_in_cluster, cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { @@ -512,7 +512,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, acb-bytes_done = 0; acb-remaining_sectors = nb_sectors; -acb-cluster_offset = 0; return acb; } @@ -564,6 +563,7 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) int ret; int cur_nr_sectors; /* number of sectors in current iteration */ QCowL2Meta l2meta; +uint64_t cluster_offset; l2meta.nb_clusters = 0; qemu_co_queue_init(l2meta.dependent_requests); @@ -585,8 +585,8 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) return ret; } -acb-cluster_offset = l2meta.cluster_offset; -assert((acb-cluster_offset 511) == 0); +cluster_offset = l2meta.cluster_offset; +assert((cluster_offset 511) == 0); qemu_iovec_reset(acb-hd_qiov); qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, @@ -612,7 +612,7 @@ static int qcow2_aio_write_cb(QCowAIOCB *acb) BLKDBG_EVENT(bs-file, BLKDBG_WRITE_AIO); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_writev(bs-file, - (acb-cluster_offset 9) + index_in_cluster, + (cluster_offset 9) + index_in_cluster, cur_nr_sectors, acb-hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -- 1.7.1
[Qemu-devel] Volume key in qcow3?
Hi, I noted that AES encryption using qcow2 just uses the given password as the key (also truncating it to 16 bytes == 128 bits). This is prone to brute-force attacks and also makes it hard to change the password (you have to decrypt and re-encrypt the entire image). LUKS and EncFS use another way. They generate a random key (the volume key), then use the password you give to encrypt it N times (where N is decided by the security level, or automatically based on the time needed to decrypt the volume key). To change the password you just give the old one, get the volume key and encrypt it again using the new one. LUKS also supports multiple slots to allow multiple passwords and even the use of an external key file. Obviously this requires an additional extension to qcow2, so I think it requires a new qcow3 format. Frediano
[Qemu-devel] [PATCH 5/8] qcow2: remove common from QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c |8 1 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 446946e..c7445cc 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -373,7 +373,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, } typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; int remaining_sectors; @@ -388,7 +388,7 @@ typedef struct QCowAIOCB { */ static int qcow2_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster, n1; int ret; @@ -504,7 +504,7 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, void *opaque, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; @@ -556,7 +556,7 @@ static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m) */ static int qcow2_aio_write_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; int n_end; -- 1.7.1
[Qemu-devel] [PATCH 6/8] qcow2: reindent and use while before the big jump
prepare to remove read/write callbacks Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2.c | 272 - 1 files changed, 135 insertions(+), 137 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index c7445cc..dfb969e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -395,107 +395,105 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset = 0; -if (acb-remaining_sectors == 0) { -/* request completed */ -return 0; -} - -/* prepare next request */ -cur_nr_sectors = acb-remaining_sectors; -if (s-crypt_method) { -cur_nr_sectors = MIN(cur_nr_sectors, -QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -} +while (acb-remaining_sectors != 0) { -ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, -cur_nr_sectors, cluster_offset); -if (ret 0) { -return ret; -} - -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); - -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, -cur_nr_sectors * 512); - -if (!cluster_offset) { - -if (bs-backing_hd) { -/* read from the base image */ -n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, -acb-sector_num, cur_nr_sectors); -if (n1 0) { -BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); -qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -n1, acb-hd_qiov); -qemu_co_mutex_lock(s-lock); -if (ret 0) { -return ret; -} -} -} else { -/* Note: in this case, no need to wait */ -qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +/* prepare next request */ +cur_nr_sectors = acb-remaining_sectors; +if (s-crypt_method) { +cur_nr_sectors = MIN(cur_nr_sectors, +QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); } -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* add AIO support for compressed blocks ? 
*/ -ret = qcow2_decompress_cluster(bs, cluster_offset); + +ret = qcow2_get_cluster_offset(bs, acb-sector_num 9, +cur_nr_sectors, cluster_offset); if (ret 0) { return ret; } -qemu_iovec_from_buffer(acb-hd_qiov, -s-cluster_cache + index_in_cluster * 512, -512 * cur_nr_sectors); -} else { -if ((cluster_offset 511) != 0) { -return -EIO; -} +index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -if (s-crypt_method) { -/* - * For encrypted images, read everything into a temporary - * contiguous buffer on which the AES functions can work. - */ -if (!acb-cluster_data) { -acb-cluster_data = -qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s-cluster_size); +qemu_iovec_reset(acb-hd_qiov); +qemu_iovec_copy(acb-hd_qiov, acb-qiov, acb-bytes_done, +cur_nr_sectors * 512); + +if (!cluster_offset) { + +if (bs-backing_hd) { +/* read from the base image */ +n1 = qcow2_backing_read1(bs-backing_hd, acb-hd_qiov, +acb-sector_num, cur_nr_sectors); +if (n1 0) { +BLKDBG_EVENT(bs-file, BLKDBG_READ_BACKING_AIO); +qemu_co_mutex_unlock(s-lock); +ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, +n1, acb-hd_qiov); +qemu_co_mutex_lock(s-lock); +if (ret 0) { +return ret; +} +} +} else { +/* Note: in this case, no need to wait */ +qemu_iovec_memset(acb-hd_qiov, 0, 512 * cur_nr_sectors); +} +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* add AIO support for compressed blocks ? */ +ret = qcow2_decompress_cluster(bs, cluster_offset); +if (ret 0) { +return ret; } -assert(cur_nr_sectors = -QCOW_MAX_CRYPT_CLUSTERS * s-cluster_sectors); -qemu_iovec_reset(acb-hd_qiov); -qemu_iovec_add(acb-hd_qiov, acb-cluster_data, +qemu_iovec_from_buffer(acb-hd_qiov, +s-cluster_cache + index_in_cluster * 512, 512 * cur_nr_sectors); -} +} else { +if ((cluster_offset 511) != 0) { +return -EIO; +} -BLKDBG_EVENT(bs-file
Re: [Qemu-devel] Block layer roadmap
2011/7/28 Christoph Hellwig h...@lst.de: On Wed, Jul 27, 2011 at 01:37:31PM +0100, Stefan Hajnoczi wrote: Coroutines in the block layer [Kevin] * Programming model to simplify block drivers without blocking QEMU threads Can anyone explain what the whole point of this is? It really just is a bit of syntactic sugar for the current async state machines. What does it buy us over going for real threading? This has nothing (or little) to do with threads. Instead of splitting functions into callbacks at every synchronous call, it allows writing more readable code. For instance, after converting the qcow code from the current style to coroutines you remove about 400 lines (about 30%). This will help with maintaining the code and developing more complicated optimizations. As for threading, you can do threading both using AIO and using coroutines. Frediano
[Qemu-devel] [PATCH] [RFC] qcow2: group refcount updates during cow
Well, I think this is the first real improvement patch. It is more an RFC than a patch. Yes, some lines are terrible! It collapses refcount decrements during COW. From a first check, the time executing test 015 dropped from about 600 seconds to 70. This at least proves that refcount updates count! Some doubts: 1- place the code in qcow2-refcount.c, as it updates only refcounts and not clusters? 2- allow some sort of begin transaction / commit / rollback like databases instead? 3- allow changing tables from different coroutines? 1) If you have a sequence like (1, 2, 4), probably these clusters are all in the same l2 table, but with this code you get two writes instead of one. I'm thinking about a function in qcow2-refcount.c that accepts an array of clusters instead of a start + len. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow2-cluster.c | 36 ++-- 1 files changed, 34 insertions(+), 2 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 81cf77d..da17365 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -675,10 +675,42 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) * Also flush bs-file to get the right order for L2 and refcount update. 
*/ if (j != 0) { +int64_t old_start = 0, old_end = -2; +int count = 0; for (i = 0; i j; i++) { -qcow2_free_any_clusters(bs, -be64_to_cpu(old_cluster[i]) ~QCOW_OFLAG_COPIED, 1); +old_cluster[i] = be64_to_cpu(old_cluster[i]) ~QCOW_OFLAG_COPIED; } +// XXX sort old_cluster +for (i = 0; i j; i++) { +int64_t cluster = old_cluster[i]; + +/* group if contiguos */ +if (old_end + 1 == (cluster s-cluster_bits)) { +++old_end; +continue; +} + +/* handle */ +if (old_end 0) { +qcow2_free_any_clusters(bs, old_start s-cluster_bits, old_end - old_start + 1); +count += old_end - old_start + 1; +} +old_end = -2; + +/* handle compressed separately */ +if ((cluster QCOW_OFLAG_COMPRESSED)) { +qcow2_free_any_clusters(bs, cluster, 1); +continue; +} + +/* start a new group */ +old_start = old_end = cluster s-cluster_bits; +} +if (old_end 0) { +qcow2_free_any_clusters(bs, old_start s-cluster_bits, old_end - old_start + 1); +count += old_end - old_start + 1; +} +assert(count == j); } ret = 0; -- 1.7.1
[Qemu-devel] [PATCH 0/3] small trivial patches
Minor fixes and improves Frediano Ziglio (3): block: removed unused function block: typo fix aio: always check paio_init result block.c | 13 - block.h |2 -- block/raw-posix.c | 13 ++--- 3 files changed, 6 insertions(+), 22 deletions(-)
[Qemu-devel] [PATCH 1/3] block: removed unused function
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block.c | 13 - block.h |2 -- 2 files changed, 0 insertions(+), 15 deletions(-) diff --git a/block.c b/block.c index 9549b9e..b2d3983 100644 --- a/block.c +++ b/block.c @@ -1108,19 +1108,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. - */ -int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num, -const uint8_t *buf, int nb_sectors) -{ -return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num, -buf, BDRV_SECTOR_SIZE * nb_sectors); -} - /** * Truncate file to 'offset' bytes (needed only for file protocols) */ diff --git a/block.h b/block.h index 59cc410..e672bc6 100644 --- a/block.h +++ b/block.h @@ -85,8 +85,6 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, const void *buf, int count); int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, const void *buf, int count); -int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num, -const uint8_t *buf, int nb_sectors); int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); -- 1.7.1
[Qemu-devel] [PATCH 2/3] block: typo fix
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/raw-posix.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index cd89c83..275b41e 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -587,7 +587,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, /* * If O_DIRECT is used the buffer needs to be aligned on a sector - * boundary. Check if this is the case or telll the low-level + * boundary. Check if this is the case or tell the low-level * driver that it needs to copy the buffer. */ if (s-aligned_buf) { -- 1.7.1