Re: [Qemu-devel] [PATCH 14/22] usb: Add packet combining functions

2012-10-18 Thread Gerd Hoffmann
  Hi,

 For handle_combined_data, yes, as usb_ep_combine_input_packets can
 cause multiple packets to get submitted, since if a combined packet
 ends with a packet, which does not have its short_not_ok flag set,
 another packet can be safely pipelined after it. This is not
 useful for usb mass storage, but very usefull for usb serial
 port converters.

Ah, I see.  You can have a queue with -- say -- 16 packets which will
get combined into 4 groups with 4 packets each - 4 callbacks.

I think handle_combined_data() should get USBCombinedPacket passed
instead of USBPacket.  It is cleaner API-wise.  Likewise for the other
usb_combined_* functions.

Allowing to call usb_ep_combine_input_packets on any endpoint (except
iso which is a special case anyway) would be good too I think, then it
is possible for usb drivers to operate on USBCombinedPackets everywhere.

BTW:  I think USBPacketGroup would be a better name for USBCombinedPacket.

cheers,
  Gerd




[Qemu-devel] [PATCH 03/15] pseries: Implement qemu initiated shutdowns using EPOW events

2012-10-18 Thread David Gibson
At present, using 'system_powerdown' from the monitor or otherwise
instructing qemu to (cleanly) shut down a pseries guest will not work,
because we did not have a method of signalling the shutdown request to the
guest.

PAPR does include a usable mechanism for this, though it is rather more
involved than the equivalent on x86.  This involves sending an EPOW
(Environmental and POwer Warning) event through the PAPR event and error
logging mechanism, which also has a number of other functions.

This patch implements just enough of the event/error logging functionality
to be able to send a shutdown event to the guest.  At least with modern
guest kernels and a userspace that is up and running, this means that
system_powerdown from the qemu monitor should now work correctly on pseries
guests.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
v2:
 * Coding style cleanups suggested by Blue Swirl
---
 hw/ppc/Makefile.objs |1 +
 hw/spapr.c   |   14 ++-
 hw/spapr.h   |   11 ++
 hw/spapr_events.c|  320 ++
 4 files changed, 344 insertions(+), 2 deletions(-)
 create mode 100644 hw/spapr_events.c

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index 951e407..8fe2123 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -11,6 +11,7 @@ obj-y += ppc_newworld.o
 obj-$(CONFIG_PSERIES) += spapr.o spapr_hcall.o spapr_rtas.o spapr_vio.o
 obj-$(CONFIG_PSERIES) += xics.o spapr_vty.o spapr_llan.o spapr_vscsi.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o pci-hotplug.o spapr_iommu.o
+obj-$(CONFIG_PSERIES) += spapr_events.o
 # PowerPC 4xx boards
 obj-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
 obj-y += ppc440_bamboo.o
diff --git a/hw/spapr.c b/hw/spapr.c
index 09b8e99..64c35a8 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -232,7 +232,8 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
target_phys_addr_t initrd_size,
target_phys_addr_t kernel_size,
const char *boot_device,
-   const char *kernel_cmdline)
+   const char *kernel_cmdline,
+   uint32_t epow_irq)
 {
 void *fdt;
 CPUPPCState *env;
@@ -403,6 +404,8 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
 _FDT((fdt_property(fdt, ibm,associativity-reference-points,
 refpoints, sizeof(refpoints;
 
+_FDT((fdt_property_cell(fdt, rtas-error-log-max, RTAS_ERROR_LOG_MAX)));
+
 _FDT((fdt_end_node(fdt)));
 
 /* interrupt controller */
@@ -433,6 +436,9 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
 
 _FDT((fdt_end_node(fdt)));
 
+/* event-sources */
+spapr_events_fdt_skel(fdt, epow_irq);
+
 _FDT((fdt_end_node(fdt))); /* close root node */
 _FDT((fdt_finish(fdt)));
 
@@ -794,6 +800,9 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 spapr-icp = xics_system_init(XICS_IRQS);
 spapr-next_irq = 16;
 
+/* Set up EPOW events infrastructure */
+spapr_events_init(spapr);
+
 /* Set up IOMMU */
 spapr_iommu_init();
 
@@ -902,7 +911,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 spapr-fdt_skel = spapr_create_fdt_skel(cpu_model,
 initrd_base, initrd_size,
 kernel_size,
-boot_device, kernel_cmdline);
+boot_device, kernel_cmdline,
+spapr-epow_irq);
 assert(spapr-fdt_skel != NULL);
 }
 
diff --git a/hw/spapr.h b/hw/spapr.h
index e984e3f..3d02911 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -26,6 +26,12 @@ typedef struct sPAPREnvironment {
 int rtc_offset;
 char *cpu_model;
 bool has_graphics;
+
+/* Data for Environmental and POwer Warnings (EPOW events) */
+uint32_t epow_irq;
+Notifier epow_notifier;
+void *pending_epow;
+uint32_t next_plid;
 } sPAPREnvironment;
 
 #define H_SUCCESS 0
@@ -335,7 +341,12 @@ typedef struct sPAPRTCE {
 #define SPAPR_VIO_BASE_LIOBN0x
 #define SPAPR_PCI_BASE_LIOBN0x8000
 
+#define RTAS_ERROR_LOG_MAX  2048
+
+
 void spapr_iommu_init(void);
+void spapr_events_init(sPAPREnvironment *spapr);
+void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq);
 DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size);
 void spapr_tce_free(DMAContext *dma);
 void spapr_tce_reset(DMAContext *dma);
diff --git a/hw/spapr_events.c b/hw/spapr_events.c
new file mode 100644
index 000..4b19612
--- /dev/null
+++ b/hw/spapr_events.c
@@ -0,0 +1,320 @@
+/*
+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
+ *
+ * RTAS events handling
+ *
+ * Copyright (c) 2012 David Gibson, IBM Corporation.
+ *
+ * Permission is hereby granted, free of 

Re: [Qemu-devel] [PATCH v11 01/14] target-mips: Add ASE DSP internal functions

2012-10-18 Thread Aurelien Jarno
On Thu, Oct 18, 2012 at 09:53:20AM +0800, Jia Liu wrote:
 Hi Aurelien,
 
 On Wed, Oct 17, 2012 at 11:15 PM, Aurelien Jarno aurel...@aurel32.net wrote:
  On Wed, Oct 17, 2012 at 11:39:00AM +0800, Jia Liu wrote:
  Hi Aurelien,
 
  On Wed, Oct 17, 2012 at 7:20 AM, Aurelien Jarno aurel...@aurel32.net 
  wrote:
   On Tue, Oct 16, 2012 at 12:39:05AM +0800, Jia Liu wrote:
   +/* a[0] is LO, a[1] is HI. */
   +static inline void mipsdsp_sat64_acc_sub_q63(int64_t *ret,
   + int32_t ac,
   + int64_t *a,
   + CPUMIPSState *env)
   +{
   +uint32_t temp64, temp63;
   +int64_t temp[2];
   +int64_t acc[2];
   +int64_t temp_sum;
   +
   +temp[0] = a[0];
   +temp[1] = a[1];
   +
   +acc[0] = env-active_tc.LO[ac];
   +acc[1] = env-active_tc.HI[ac];
   +
   +temp_sum = acc[0] - temp[0];
   +if (MIPSDSP_OVERFLOW(acc[0], -temp[0], temp_sum, 
   0x8000ull)) {
   +acc[1] -= 1;
   +}
   +acc[0] = temp_sum;
   +
   +temp_sum = acc[1] - temp[1];
   +acc[1] = temp_sum;
   +
   +temp64 = acc[1]  0x01;
   +temp63 = (acc[0]  63)  0x01;
   +
   +/* MIPSDSP_OVERFLOW only can check if a 64 bits sub is overflow,
   + * there are two 128 bits value subed then check the 63/64 bits 
   are equal
   + * or not.*/
  
   If you disagree with what I say, you can send mail, there is no need to
   put it as a comment.
  
   That said MIPSDSP_OVERFLOW doesn't work only on 64-bit values, it can
   work other size, as it is done elsewhere in this patch. The only thing
   it checked is the highest bit of the two arguments and the result.
   Therefore if you pass the highest part of the values, it can work.
  
 
  I did agree with you, just didn't totally get your point.
 
  MIPSDSP_OVERFLOW used to check overflow, but here, 128bit + 128bit,
  low 64bit overflow need to be checked, so, in fact, I'm not sure what
  should do. Is this code right?
 
  static inline void mipsdsp_sat64_acc_sub_q63(uint64_t *ret,
   int32_t ac,
   uint64_t *a,
   CPUMIPSState *env)
  {
  uint32_t temp64;
  uint64_t temp[2];
  uint64_t acc[2];
 
  temp[0] = a[0];
  temp[1] = a[1];
 
  acc[0] = env-active_tc.LO[ac];
  acc[1] = env-active_tc.HI[ac];
 
  temp[1] = acc[1] - temp[1];
  temp[0] = acc[0] - temp[0];
 
  temp64 = temp[1]  0x01;
 
  if (temp64 ^ MIPSDSP_OVERFLOW(acc[0], temp[0], temp[0], (0x01ull  
  63))) {
  if (temp64 == 1) {
  ret[0] = (0x01ull  63);
  ret[1] = ~0ull;
  } else {
  ret[0] = (0x01ull  63) - 1;
  ret[1] = 0x00;
  }
  set_DSPControl_overflow_flag(1, 16 + ac, env);
  } else {
  ret[0] = temp[0];
  ret[1] = acc[0]  temp[0] ? temp[1] : temp[1] - 1;
  }
  }
 
 
  I don't think xoring temp64 with MIPSDSP_OVERFLOW is correct. What about
  about something like that (untested):
 
  | static inline void mipsdsp_sat64_acc_sub_q63(uint64_t *ret,
  |  int32_t ac,
  |  uint64_t *a,
  |  CPUMIPSState *env)
  | {
  | ret[0] = env-active_tc.LO[ac] - a[0];
  | ret[1] = env-active_tc.HI[ac] - a[1];
  |
 
 In the MIPS-DSP manual, the function is
 
 function sat64AccumulateSubQ63( acc1..0, a127..0 )
 temp128..0 ← HI[acc]63 || HI[acc]63..0 || LO[acc]63..0
 temp128..0 ← temp - a127..0 if ( temp64 ≠ temp63 ) then
 if ( temp64 = 1 ) then
 temp127..0 ← 0x8000
 else
 temp127..0 ← 0x7FFF
 endif
 DSPControlouflag:16+acc ← 1 endif
 return temp127..0
 endfunction sat64AccumulateSubQ63
 
  | if (MIPSDSP_OVERFLOW(env-active_tc.LO[ac], -a[0], ret[0], 
  0x8000ull)) {
 
 The bit 64 will influence the overflow of bits 0-63.
 That is, if bit 64 is 1, and bits 0-63 overflowed, it won't be caught.
 So, if we use this code, it won't be handled.
 

I think you are correct, we have to take into account the overflow part,
but also the original value of the temp64 bit.

  | if ((ret[1] - 1)  1) {
  | ret[0] = (0x01ull  63);
  | ret[1] = ~0ull;
  | } else {
  | ret[0] = (0x01ull  63) - 1;
  | ret[1] = 0x00;
  | }
  | set_DSPControl_overflow_flag(1, 16 + ac, env);
  | }
  | }
  |
 
  The same applies for the add function, but also to some other places in
  the code.
 
  Also note that you might want to have two version of MIPSDSP_OVERFLOW(),
  one for add (like the current one), and one for sub (without the ^ -1),
  so that you don't have to pass the negative value 

[Qemu-devel] [PATCH v3] qemu-img: document 'info --backing-chain'

2012-10-18 Thread Kashyap Chamarthy
Signed-off-by: Kashyap Chamarthy kashyap...@gmail.com
---
 qemu-img-cmds.hx |  4 ++--
 qemu-img.texi| 21 -
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 
0ef82e9ac7a89d607e1f2c8e38461cee5b438dba..a18136302d2ca7a7540672f1b9ef89603e89edc0
 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -34,9 +34,9 @@ STEXI
 ETEXI
 
 DEF(info, img_info,
-info [-f fmt] [--output=ofmt] filename)
+info [-f fmt] [--output=ofmt] [--backing-chain] filename)
 STEXI
-@item info [-f @var{fmt}] [--output=@var{ofmt}] @var{filename}
+@item info [-f @var{fmt}] [--output=@var{ofmt}] [--backing-chain] 
@var{filename}
 ETEXI
 
 DEF(snapshot, img_snapshot,
diff --git a/qemu-img.texi b/qemu-img.texi
index 
8b05f2c42801a2535ab4390b47bc415eb880625a..fa4ced6e5a24eed0adf1b44b471d6959055a58a1
 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -28,6 +28,10 @@ Command parameters:
 is the disk image format. It is guessed automatically in most cases. See below
 for a description of the supported disk formats.
 
+@item --backing-chain
+will enumerate information about backing files in a disk image chain. Refer
+below for further description.
+
 @item size
 is the disk image size in bytes. Optional suffixes @code{k} or @code{K}
 (kilobyte, 1024) @code{M} (megabyte, 1024k) and @code{G} (gigabyte, 1024M)
@@ -129,7 +133,7 @@ created as a copy on write image of the specified base 
image; the
 @var{backing_file} should have the same content as the input's base image,
 however the path, image format, etc may differ.
 
-@item info [-f @var{fmt}] [--output=@var{ofmt}] @var{filename}
+@item info [-f @var{fmt}] [--output=@var{ofmt}] [--backing-chain] 
@var{filename}
 
 Give information about the disk image @var{filename}. Use it in
 particular to know the size reserved on disk which can be different
@@ -137,6 +141,21 @@ from the displayed size. If VM snapshots are stored in the 
disk image,
 they are displayed too. The command can output in the format @var{ofmt}
 which is either @code{human} or @code{json}.
 
+If a disk image has a backing file chain, information about each disk image in
+the chain can be recursively enumerated by using the option 
@code{--backing-chain}.
+
+For instance, if you have an image chain like:
+
+@example
+base.qcow2 - snap1.qcow2 - snap2.qcow2
+@end example
+
+To enumerate information about each disk image in the above chain, starting 
from top to base, do:
+
+@example
+qemu-img info --backing-chain snap2.qcow2
+@end example
+
 @item snapshot [-l | -a @var{snapshot} | -c @var{snapshot} | -d @var{snapshot} 
] @var{filename}
 
 List, apply, create or delete snapshots in image @var{filename}.
-- 
1.7.12.1




Re: [Qemu-devel] [PATCH 3/7] serial: add windows inf file for the pci card to docs

2012-10-18 Thread Gerd Hoffmann
On 10/17/12 22:12, Hervé Poussineau wrote:
 Signed-off-by: Gerd Hoffmann address@hidden
 
 ---
  docs/qemupciserial.inf |  109
 
 1 files changed, 109 insertions(+), 0 deletions(-)
 create mode 100644 docs/qemupciserial.inf
 [...]
 
 Did you try to add emulation of a PCI serial card natively recognized by
 Windows XP (at least),
 like SIIG CyberSerial card?

Thats the only one.  Looked at the linux code for it, a comment mumbled
something about a eeprom holding card configuration, /me figured I
better don't go that route.  If it would have been a simple card with
just a 16550  nothing else I would have picked it.

cheers,
  Gerd




Re: [Qemu-devel] [PATCH v2] pci: Return PCI_INTX_DISABLED when no bus INTx routing support

2012-10-18 Thread Jan Kiszka
On 2012-10-18 00:13, Alex Williamson wrote:
 Rather than assert, simply return PCI_INTX_DISABLED when we don't
 have a pci_route_irq_fn.  PIIX already returns DISABLED for an
 invalid pin, so users already deal with this state.  Users of this
 interface should only be acting on an ENABLED or INVERTED return
 value (though we really have no support for INVERTED).  Also
 complain loudly when we hit this so we don't forget it's missing.
 
 Signed-off-by: Alex Williamson alex.william...@redhat.com
 ---
 
 v2: Turn up the annoyance factor for hitting this
 
  hw/pci.c |8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)
 
 diff --git a/hw/pci.c b/hw/pci.c
 index 83d262a..6a66b32 100644
 --- a/hw/pci.c
 +++ b/hw/pci.c
 @@ -1094,7 +1094,13 @@ PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice 
 *dev, int pin)
   pin = bus-map_irq(dev, pin);
   dev = bus-parent_dev;
  } while (dev);
 -assert(bus-route_intx_to_irq);
 +
 +if (!bus-route_intx_to_irq) {
 +error_report(PCI: Bug - unimplemented PCI INTx routing (%s)\n,
 + object_get_typename(OBJECT(bus-qbus.parent)));
 +return (PCIINTxRoute) { PCI_INTX_DISABLED, -1 };
 +}
 +
  return bus-route_intx_to_irq(bus-irq_opaque, pin);
  }
  
 

I'm fine with this. I also see this as dead code in x86 (any x86 chipset
will support this API, for sure), but maybe it helps on other archs. So:

Acked-by: Jan Kiszka jan.kis...@siemens.com

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux



[Qemu-devel] 1.1.1 - 1.1.2 migrate / managed save issue

2012-10-18 Thread Doug Goldstein
I'm using libvirt 0.10.2 and I had qemu-kvm 1.1.1 running all my VMs.
I used libvirt's managedsave command to pause all the VMs and write
them to disk and when I brought up the machine again I had upgraded to
qemu-kvm 1.1.2 and attempted to resume the VMs from their state. It
unfortunately fails. During the life of the VM I did not attempt to
adjust the amount of memory it had via the balloon device unless of
course libvirt did behind the scenes on me. Below is the command line
invocation and the error:

LC_ALL=C 
PATH=/bin:/sbin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:/opt/bin
HOME=/root USER=root QEMU_AUDIO_DRV=spice /usr/bin/qemu-kvm -name expo
-S -M pc-1.0 -cpu
Penryn,+pdcm,+xtpr,+tm2,+est,+smx,+vmx,+ds_cpl,+monitor,+dtes64,+pbe,+tm,+ht,+ss,+acpi,+ds,+vme
-enable-kvm -m 1024 -smp 1,sockets=1,cores=1,threads=1 -uuid
19034754-aa3f-9671-d247-1bc53134e3f0 -no-user-config -nodefaults
-chardev 
socket,id=charmonitor,path=/var/lib/libvirt/qemu/expo.monitor,server,nowait
-mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc
-no-shutdown -device
virtio-serial-pci,id=virtio-serial0,bus=pci.0,addr=0x4 -device
piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive
file=/var/lib/libvirt/images/expo.img,if=none,id=drive-ide0-0-0,format=raw,cache=none
-device ide-hd,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0,bootindex=1
-netdev tap,fd=23,id=hostnet0 -device
rtl8139,netdev=hostnet0,id=net0,mac=52:54:00:0b:29:d9,bus=pci.0,addr=0x3
-chardev pty,id=charserial0 -device
isa-serial,chardev=charserial0,id=serial0 -chardev
spicevmc,id=charchannel0,name=vdagent -device
virtserialport,bus=virtio-serial0.0,nr=1,chardev=charchannel0,id=channel0,name=com.redhat.spice.0
-spice port=5901,addr=127.0.0.1,disable-ticketing -vga qxl -global
qxl-vga.vram_size=67108864 -incoming fd:20 -device
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5
char device redirected to /dev/pts/7
qemu: warning: error while loading state for instance 0x0 of device 'ram'
load of migration failed


-- 
Doug Goldstein



Re: [Qemu-devel] [PATCH 1/6] qdev: rework device properties.

2012-10-18 Thread Gerd Hoffmann
On 10/17/12 22:14, Eduardo Habkost wrote:
 On Wed, Jul 15, 2009 at 01:43:31PM +0200, Gerd Hoffmann wrote:
 [...]
 diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
 new file mode 100644
 index 000..8b0d0ff
 --- /dev/null
 +++ b/hw/qdev-properties.c
 @@ -0,0 +1,246 @@
 
 Gerd, could you clarify what's the copyright/license of this file? (I
 mean, at least the copyright/license of the initial version of the file
 you wrote, below).
 
 I am CCing all other authors that touched the file (according to git
 logs), so they can clarify what's the license they assumed for the file
 and their contributions.

gplv2+

cheers,
  Gerd




[Qemu-devel] [PATCH 02/15] target-ppc: Rework storage of VPA registration state

2012-10-18 Thread David Gibson
With PAPR guests, hypercalls allow registration of the Virtual Processor
Area (VPA), SLB shadow and dispatch trace log (DTL), each of which allow
for certain communication between the guest and hypervisor.  Currently, we
store the addresses of the three areas and the size of the dtl in
CPUPPCState.

The SLB shadow and DTL are variable sized, with the size being retrieved
from within the registered memory area at the hypercall time.  This size
can later be overwritten with other information, however, so we need to
save the size as of registration time.  We already do this for the DTL,
but not for the SLB shadow, so this patch fixes that.

In addition, we change the storage of the VPA information to use fixed
size integer types which will make life easier for syncing this data with
KVM, which we will need in future.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/spapr_hcall.c|   26 ++
 target-ppc/cpu.h|7 +++
 target-ppc/translate_init.c |7 ---
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index 762493a..621dabd 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -366,26 +366,26 @@ static target_ulong register_vpa(CPUPPCState *env, 
target_ulong vpa)
 return H_PARAMETER;
 }
 
-env-vpa = vpa;
+env-vpa_addr = vpa;
 
-tmp = ldub_phys(env-vpa + VPA_SHARED_PROC_OFFSET);
+tmp = ldub_phys(env-vpa_addr + VPA_SHARED_PROC_OFFSET);
 tmp |= VPA_SHARED_PROC_VAL;
-stb_phys(env-vpa + VPA_SHARED_PROC_OFFSET, tmp);
+stb_phys(env-vpa_addr + VPA_SHARED_PROC_OFFSET, tmp);
 
 return H_SUCCESS;
 }
 
 static target_ulong deregister_vpa(CPUPPCState *env, target_ulong vpa)
 {
-if (env-slb_shadow) {
+if (env-slb_shadow_addr) {
 return H_RESOURCE;
 }
 
-if (env-dispatch_trace_log) {
+if (env-dtl_addr) {
 return H_RESOURCE;
 }
 
-env-vpa = 0;
+env-vpa_addr = 0;
 return H_SUCCESS;
 }
 
@@ -407,18 +407,20 @@ static target_ulong register_slb_shadow(CPUPPCState *env, 
target_ulong addr)
 return H_PARAMETER;
 }
 
-if (!env-vpa) {
+if (!env-vpa_addr) {
 return H_RESOURCE;
 }
 
-env-slb_shadow = addr;
+env-slb_shadow_addr = addr;
+env-slb_shadow_size = size;
 
 return H_SUCCESS;
 }
 
 static target_ulong deregister_slb_shadow(CPUPPCState *env, target_ulong addr)
 {
-env-slb_shadow = 0;
+env-slb_shadow_addr = 0;
+env-slb_shadow_size = 0;
 return H_SUCCESS;
 }
 
@@ -437,11 +439,11 @@ static target_ulong register_dtl(CPUPPCState *env, 
target_ulong addr)
 return H_PARAMETER;
 }
 
-if (!env-vpa) {
+if (!env-vpa_addr) {
 return H_RESOURCE;
 }
 
-env-dispatch_trace_log = addr;
+env-dtl_addr = addr;
 env-dtl_size = size;
 
 return H_SUCCESS;
@@ -449,7 +451,7 @@ static target_ulong register_dtl(CPUPPCState *env, 
target_ulong addr)
 
 static target_ulong deregister_dtl(CPUPPCState *env, target_ulong addr)
 {
-env-dispatch_trace_log = 0;
+env-dtl_addr = 0;
 env-dtl_size = 0;
 
 return H_SUCCESS;
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index faf4404..eb2f43d 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -1045,10 +1045,9 @@ struct CPUPPCState {
 #endif
 
 #if defined(TARGET_PPC64)  !defined(CONFIG_USER_ONLY)
-target_phys_addr_t vpa;
-target_phys_addr_t slb_shadow;
-target_phys_addr_t dispatch_trace_log;
-uint32_t dtl_size;
+uint64_t vpa_addr;
+uint64_t slb_shadow_addr, slb_shadow_size;
+uint64_t dtl_addr, dtl_size;
 #endif /* TARGET_PPC64 */
 
 int error_code;
diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
index a972287..831b169 100644
--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -10425,9 +10425,10 @@ static void ppc_cpu_reset(CPUState *s)
 env-error_code = 0;
 
 #if defined(TARGET_PPC64)  !defined(CONFIG_USER_ONLY)
-env-vpa = 0;
-env-slb_shadow = 0;
-env-dispatch_trace_log = 0;
+env-vpa_addr = 0;
+env-slb_shadow_addr = 0;
+env-slb_shadow_size = 0;
+env-dtl_addr = 0;
 env-dtl_size = 0;
 #endif /* TARGET_PPC64 */
 
-- 
1.7.10.4




Re: [Qemu-devel] [PATCH 1/6] qdev: rework device properties.

2012-10-18 Thread Christian Borntraeger
On 17/10/12 22:14, Eduardo Habkost wrote:
 On Wed, Jul 15, 2009 at 01:43:31PM +0200, Gerd Hoffmann wrote:
 [...]
 diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
 new file mode 100644
 index 000..8b0d0ff
 --- /dev/null
 +++ b/hw/qdev-properties.c
 @@ -0,0 +1,246 @@
 
 Gerd, could you clarify what's the copyright/license of this file? (I
 mean, at least the copyright/license of the initial version of the file
 you wrote, below).
 
 I am CCing all other authors that touched the file (according to git
 logs), so they can clarify what's the license they assumed for the file
 and their contributions.

I assumed GPLv2+.





[Qemu-devel] [PATCH 06/15] pseries: Use #define for XICS base irq number

2012-10-18 Thread David Gibson
From: Ben Herrenschmidt b...@kernel.crashing.org

Currently the lowest real irq number for the XICS irq controller (as
opposed to numbers reserved for IPIs and other special purposes) is
hard coded as 16 in two places - in xics_system_init() and in spapr.c.

As well as being generally bad practice, we're going to need to change this
number soon to fit in with the in-kernel XICS implementation.  This patch
adds a #define for this number to avoid future breakage.

Signed-off-by: Michael Ellerman mich...@ellerman.id.au
Signed-off-by: Ben Herrenschmidt b...@kernel.crashing.org
Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/spapr.c |2 +-
 hw/xics.c  |2 +-
 hw/xics.h  |1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/spapr.c b/hw/spapr.c
index 64c35a8..789c941 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -798,7 +798,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 
 /* Set up Interrupt Controller */
 spapr-icp = xics_system_init(XICS_IRQS);
-spapr-next_irq = 16;
+spapr-next_irq = XICS_IRQ_BASE;
 
 /* Set up EPOW events infrastructure */
 spapr_events_init(spapr);
diff --git a/hw/xics.c b/hw/xics.c
index 7a899dd..db01fe3 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -548,7 +548,7 @@ struct icp_state *xics_system_init(int nr_irqs)
 
 ics = g_malloc0(sizeof(*ics));
 ics-nr_irqs = nr_irqs;
-ics-offset = 16;
+ics-offset = XICS_IRQ_BASE;
 ics-irqs = g_malloc0(nr_irqs * sizeof(struct ics_irq_state));
 
 icp-ics = ics;
diff --git a/hw/xics.h b/hw/xics.h
index 6817268..c3bf008 100644
--- a/hw/xics.h
+++ b/hw/xics.h
@@ -28,6 +28,7 @@
 #define __XICS_H__
 
 #define XICS_IPI0x2
+#define XICS_IRQ_BASE   0x10
 
 struct icp_state;
 
-- 
1.7.10.4




[Qemu-devel] [PATCH 05/15] pseries: Clean up inconsistent variable name in xics.c

2012-10-18 Thread David Gibson
Throughout xics.c 'nr' is used to refer to a global interrupt number, and
'server' is used to refer to an interrupt server number (i.e. CPU number).
Except in icp_set_mfrr(), where 'nr' is used as a server number.  Fix this
confusing inconsistency.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/xics.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/xics.c b/hw/xics.c
index ce88aa7..7a899dd 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -108,13 +108,13 @@ static void icp_set_cppr(struct icp_state *icp, int 
server, uint8_t cppr)
 }
 }
 
-static void icp_set_mfrr(struct icp_state *icp, int nr, uint8_t mfrr)
+static void icp_set_mfrr(struct icp_state *icp, int server, uint8_t mfrr)
 {
-struct icp_server_state *ss = icp-ss + nr;
+struct icp_server_state *ss = icp-ss + server;
 
 ss-mfrr = mfrr;
 if (mfrr  CPPR(ss)) {
-icp_check_ipi(icp, nr);
+icp_check_ipi(icp, server);
 }
 }
 
-- 
1.7.10.4




Re: [Qemu-devel] Disabling KVM on the fly

2012-10-18 Thread Paolo Bonzini
Il 17/10/2012 20:37, Jan Kiszka ha scritto:
 On 2012-10-17 18:44, Paolo Bonzini wrote:
 Il 17/10/2012 18:37, Clemens Kolbitsch ha scritto:
 Guys,

 I know this is question might seem a bit odd, but I'm curious:

 Has anyone ever tried to write code to disable KVM on the fly / is it
 at all possible? I have a situation where I need to use TCG for
 certain parts of the code, but would love to have acceleration for
 everything else. My idea was to pause the VM, then use the
 snapshotting mechanism to dump the state, and then to resume the
 snapshot, but writing the KVM state into the non-KVM structures.

 As a start, you can try using migrate exec:catfoo.save with a KVM
 machine and -incoming 'exec:cat foo.save' with a TCG machine.  The
 main problem should be that TCG doesn't implement kvmclock.

 If you disable the KVM interrupt controller and timer (which is just an
 implementation detail, not a hardware difference),
 
 Unnecessary. Both models (KVM in-kernel and QEMU userspace) are
 compatible - in the absence of bugs.

He wants to really switch it on the fly---not just migrate out and
in---and for that you need to disable the KVM-specific devices.

 But loading a KVM image into TCG lets non-trival guests lock up. Likely
 due to differences in the CPU virtualization/emulation (MSRs...).

Perhaps that can be mitigated by using an older machine model.  Start
with something simple like a pentium2 and work up from there...

Paolo



Re: [Qemu-devel] [PATCH v2] pci: Return PCI_INTX_DISABLED when no bus INTx routing support

2012-10-18 Thread Michael S. Tsirkin
On Wed, Oct 17, 2012 at 04:13:12PM -0600, Alex Williamson wrote:
 Rather than assert, simply return PCI_INTX_DISABLED when we don't
 have a pci_route_irq_fn.  PIIX already returns DISABLED for an
 invalid pin, so users already deal with this state.  Users of this
 interface should only be acting on an ENABLED or INVERTED return
 value (though we really have no support for INVERTED).  Also
 complain loudly when we hit this so we don't forget it's missing.
 
 Signed-off-by: Alex Williamson alex.william...@redhat.com
 ---

Thanks, applied.

 v2: Turn up the annoyance factor for hitting this
 
  hw/pci.c |8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)
 
 diff --git a/hw/pci.c b/hw/pci.c
 index 83d262a..6a66b32 100644
 --- a/hw/pci.c
 +++ b/hw/pci.c
 @@ -1094,7 +1094,13 @@ PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice 
 *dev, int pin)
   pin = bus-map_irq(dev, pin);
   dev = bus-parent_dev;
  } while (dev);
 -assert(bus-route_intx_to_irq);
 +
 +if (!bus-route_intx_to_irq) {
 +error_report(PCI: Bug - unimplemented PCI INTx routing (%s)\n,
 + object_get_typename(OBJECT(bus-qbus.parent)));
 +return (PCIINTxRoute) { PCI_INTX_DISABLED, -1 };
 +}
 +
  return bus-route_intx_to_irq(bus-irq_opaque, pin);
  }
  



Re: [Qemu-devel] nvram and boot order

2012-10-18 Thread Alexander Graf


On 18.10.2012, at 03:18, Benjamin Herrenschmidt b...@kernel.crashing.org 
wrote:

 On Thu, 2012-10-18 at 11:09 +1100, David Gibson wrote:
 
 That's horrible; if you use -boot just once it will clobber a
 persistent NVRAM's boot order.  I see that a means of changing the
 default boot order from management tools is desirable, but that
 shouldn't be the normal behaviour of -boot.  And the objections to (2)
 apply even more strongly - we'd need to translate arbitrary -boot
 strings to NVRAM representation which may not be at all
 straightforward from the information qemu has available.
 
 It may not be straight forward, but it's what makes the most sense from
 a user's PoV.
 
 Bollocks.  Using -boot to override the normal boot sequence
 permanently changing the normal boot sequence absoultely does not make
 sense from a user's PoV.
 
 I strongly agree with David here. -boot should not change the persistent
 state.

I think Anthony and you are looking at 2 different use cases, each with their 
own sane reasoning.

You want to have the chance to override the boot order temporarily for things 
like cd boot or quick guest rescue missions.

You also want to be able to permanently change the guest's boot order from a 
management tool. At that same place you want to be able to display it, so you 
don't have to boot your vm to know what it would be doing.

As for device detection logic, both face the same problems. You need to be able 
to say 'boot from cd-rom first temporarily' just the same as you need to be 
able to say 'boot from the first cd-rom as first boot option permanently'. The 
permanent change needs to be possible with the vm turned off though.

I suppose that Anthony's reasoning is that we can implement temporary in the 
management layer (or even qemu) if we have the permanent mechanism, by 
switching back to the previous state after shutdown if the guest written boot 
order didn't change.

I don't mind personally if we have one interface for temporary and persistent 
or 2 separate ones, but I think we should aim for having both options available 
in the long run. Though doing permanent changes first and reverting them later 
could raise problems when you kill your vm, since that wouldn't clean up the 
temporary change.

 
 In our case, the persistent state will have been carefully crafted by
 complicated scripts by the distro installer, and while I may want to use
 -boot to boot once off a cd image or similar, I certainly don't want
 that to affect my nvram setting pointing to the right on-disk
 bootloader.
 
 Additionally I don't want qemu to have to understand all the intricacies
 of expressing OFW boot path if we can avoid it.

Yes, the same problem as EFI for example is facing. The solution here is as 
simple as it gets: a new device name space. Instead of having a boot list entry 
saying 'boot from device x, part y, file z' you would get an entry saying 'boot 
from /qemu/disk0' and leave the rest to the firmware. The good thing about this 
approach is that it again is persistable and can be used in boot order lists. 
So you can directly translate -boot cd into /qemu/disk0,/qemu/cdrom. And if you 
screwed up your guest boot config, just put that order in by hand into 
permanent config.

 
 Qemu gives as much info as it can and let the firmware itself inside the
 guest figure things out.

Yes, that's the only chance we have really. Even for bootindex, which could for 
example get translated to /qemu/pci/0.10.0/disk0 which again would then get 
aliased to the actual disk device node behind pci device 0.10.0 (first disk) by 
SLOF.

 
 In fact, I don't want Qemu to know anything about our internal nvram
 format. This is a business between the guest FW and the guest OS. The
 only thing qemu is allowed to do is wipe it out if asked to do so :-)

It might be useful to use fdt in nvram to store the permanent boot order. That 
way QEMU / management tools have the chance to make persistent changes. 
Everyone around already understands fdt anyways :).

 
 Um.. as far as I can tell that's a point in favour of my position.  It
 makes it impossible for qemu to correctly describe boot sequences
 using these devices in the terms firmware uses internally.  On the
 other hand it certainly is possible for qemu to pass bootorder=cd
 (or whatever) to the firmware via device tree of fw_cfg and have
 firmware locally interpret that in tersm of what it knows about
 available devices.
 
 This is more/less what happens with -boot today. IE. If you pass c
 SLOF looks for a bootable disk (though arguably the algorithm could be
 improved), d for a bootable optical media etc...
 
 We definitely want something a bit more expressive and in some case
 might even be able to pass down from the command line a full path to an
 actual device but we don't necessarily want qemu to understand the nvram
 format of this.
 
 Make it an expressive representation that makes sense to qemu, and let
 the FW translate that to something it 

Re: [Qemu-devel] [PATCH 1/6] qdev: rework device properties.

2012-10-18 Thread Jan Kiszka
On 2012-10-17 22:14, Eduardo Habkost wrote:
 On Wed, Jul 15, 2009 at 01:43:31PM +0200, Gerd Hoffmann wrote:
 [...]
 diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
 new file mode 100644
 index 000..8b0d0ff
 --- /dev/null
 +++ b/hw/qdev-properties.c
 @@ -0,0 +1,246 @@
 
 Gerd, could you clarify what's the copyright/license of this file? (I
 mean, at least the copyright/license of the initial version of the file
 you wrote, below).
 
 I am CCing all other authors that touched the file (according to git
 logs), so they can clarify what's the license they assumed for the file
 and their contributions.

Under the license the original author considered for this file. If that
is unclear, GPLv2.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux



Re: [Qemu-devel] [PATCH 13/15] target-pcc: Convert ppcemb_tlb_t to use fixed 64-bit RPN

2012-10-18 Thread Alexander Graf


On 18.10.2012, at 07:50, David Gibson da...@gibson.dropbear.id.au wrote:

 Currently the ppcemb_tlb_t struct, used on a number of embedded ppc models
 to represent a TLB entry contains a target_phys_addr_t.  That works
 reasonably for now, but is troublesome for saving the state, which we'll
 want to do in future.  target_phys_addr_t is a large enough type to contain
 a physical address for any supported machine - and can thus, in theory at
 least, vary depending on what machines are enabled other than the one
 we're actually using right now.  This makes it unsuitable for describing
 in vmstate.

Target_phys_addr_t is actually 64bit for all ppc targets today since some 32 
bit boards support more than 32 bit address space ;).

The change still is fine though, as it makes that bit explicit.

Alex

 
 This patch therefore changes ppcemb_tlb_t to use a fixed 64-bit integer
 which we know is sufficient for all the machines which use this structure.
 
 Signed-off-by: David Gibson da...@gibson.dropbear.id.au
 ---
 target-ppc/cpu.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
 index cde6da0..f30e0c7 100644
 --- a/target-ppc/cpu.h
 +++ b/target-ppc/cpu.h
 @@ -355,7 +355,7 @@ struct ppc6xx_tlb_t {
 
 typedef struct ppcemb_tlb_t ppcemb_tlb_t;
 struct ppcemb_tlb_t {
 -target_phys_addr_t RPN;
 +uint64_t RPN;
 target_ulong EPN;
 target_ulong PID;
 target_ulong size;
 -- 
 1.7.10.4
 



Re: [Qemu-devel] hdd and ssd passthrough

2012-10-18 Thread Paolo Bonzini
Il 17/10/2012 23:20, Mario Giammarco ha scritto:
 Hello,
 I hope I am in the right list.
 I would like to pass a real disk to a guest (freenas) in kvm/qemu without
 iommu/vt-d.
 
 I need guest to be able to:
 
 - use hdd smart;
 - configure hdd params like power saving;
 - understand real make and model of hdd;
 - understand when the hdd is an ssd.

This is really a libvirt question more than QEMU... anyway if your guest
is new enough (Linux 3.4 or RHEL/CentOS 6.3) you can probably use
virtio-scsi with XML that looks like this (using virsh edit):

controller type='scsi' model='virtio-scsi'/
disk type='block' device='lun'
  driver name='qemu' type='raw' cache='none'/
  source dev='/dev/sdb'/
  target dev='sda' bus='scsi'/
/disk

For passthrough you need the device='lun' setting.

You can also use virsh from the command line to achieve the same.
Write the controller and disk elements to a new file, like hba.xml
and sdb.xml.  Then:

# virsh attach-device --persistent Guest1 ~/hba.xml
# virsh attach-device --persistent Guest1 ~/sdb.xml

virt-manager and virt-install don't yet fully support virtio-scsi.

Paolo




[Qemu-devel] [Bug 1065325] Re: qemu-system-arm hangs on SIGUSR1 on OS X 10.8.2

2012-10-18 Thread Paolo Bonzini
This seems like an OS bug (kernel or Cocoa). The usage of the signal is
common to all QEMU system emulation targets, so it's not dependent on
Stellaris.

Try an older version of OS X, or try a different video backend (e.g.
VNC). That may help isolating the problem.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1065325

Title:
  qemu-system-arm hangs on SIGUSR1 on OS X 10.8.2

Status in QEMU:
  New

Bug description:
  I built the latest version of QEMU

  commit b4ae3cfa57b8c1bdbbd7b7d420971e9171203ade
  Date:   Mon Oct 1 12:34:37 2012 +1000

  My system is:
  Darwin localhost 12.2.0 Darwin Kernel Version 12.2.0: Sat Aug 25 00:48:52 PDT 
2012; root:xnu-2050.18.24~1/RELEASE_X86_64 x86_64

  localhost:qemu oliverks$ gcc -v
  Using built-in specs.
  Target: i686-apple-darwin11
  Configured with: 
/private/var/tmp/llvmgcc42/llvmgcc42-2336.11~28/src/configure 
--disable-checking --enable-werror 
--prefix=/Applications/Xcode.app/Contents/Developer/usr/llvm-gcc-4.2 
--mandir=/share/man --enable-languages=c,objc,c++,obj-c++ 
--program-prefix=llvm- --program-transform-name=/^[cg][^.-]*$/s/$/-4.2/ 
--with-slibdir=/usr/lib --build=i686-apple-darwin11 
--enable-llvm=/private/var/tmp/llvmgcc42/llvmgcc42-2336.11~28/dst-llvmCore/Developer/usr/local
 --program-prefix=i686-apple-darwin11- --host=x86_64-apple-darwin11 
--target=i686-apple-darwin11 --with-gxx-include-dir=/usr/include/c++/4.2.1
  Thread model: posix
  gcc version 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)

  Shortly after start up I freeze.  I am running the command line

  ./arm-softmmu/qemu-system-arm -M lm3s811evb -kernel
  ../FreeRTOSV7.2.0/FreeRTOS/Demo/CORTEX_LM3S811_GCC/gcc/RTOSDemo.axf

  The hang appears to occur due to this signal being sent

  static void qemu_tcg_init_cpu_signals(void)
  {
  sigset_t set;
  struct sigaction sigact;

  memset(sigact, 0, sizeof(sigact));
  sigact.sa_handler = cpu_signal;
  sigaction(SIG_IPI, sigact, NULL); // -- Signal that hangs system

  sigemptyset(set);
  sigaddset(set, SIG_IPI);
  pthread_sigmask(SIG_UNBLOCK, set, NULL);
  }

  Oliver

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1065325/+subscriptions



[Qemu-devel] [0/15] pseries patch queue

2012-10-18 Thread David Gibson
Hi Alex,

Here's the current state of the pseries pending patch queue.  Some of
these I think you already have, but I've lost track of what's in your
tree but not in mainline yet.

The biggest thing is the PAPR NVRAM implementation, and the SLOF
update to use it.  This doesn't yet implement all the boot order stuff
that's been discussed - what it implements is no worse than what we
have now, and there will be further patches coming to bring us towards
the desired behaviour.

This series also has what I hope is finally a correct version of the
FPU state extension.  Most of the rest is cleanups to the XICS code in
preparation for supporting the in-kernel XICS implementation.




[Qemu-devel] [PATCH 08/15] pseries: Move XICS initialization before cpu initialization

2012-10-18 Thread David Gibson
From: Ben Herrenschmidt b...@kernel.crashing.org

Currently, the pseries machine initializes the cpus, then the XICS
interrupt controller.  However, to support the upcoming in-kernel XICS
implementation we will need to initialize the irq controller before the
vcpus.  This patch makes the necesssary rearrangement.  This means the
xics init code can no longer auto-detect the number of cpus (interrupt
servers in XICS terminology) and so we must pass that in explicitly from
the platform code.

Signed-off-by: Michael Ellerman mich...@ellerman.id.au
Signed-off-by: Ben Herrenschmidt b...@kernel.crashing.org
Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/spapr.c |   11 +++
 hw/xics.c  |   52 +++-
 hw/xics.h  |3 ++-
 3 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/hw/spapr.c b/hw/spapr.c
index 789c941..e8dbd97 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -744,6 +744,11 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 spapr-htab_shift++;
 }
 
+/* Set up Interrupt Controller before we create the VCPUs */
+spapr-icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / 
smp_threads,
+  XICS_IRQS);
+spapr-next_irq = XICS_IRQ_BASE;
+
 /* init CPUs */
 if (cpu_model == NULL) {
 cpu_model = kvm_enabled() ? host : POWER7;
@@ -756,6 +761,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 }
 env = cpu-env;
 
+xics_cpu_setup(spapr-icp, env);
+
 /* Set time-base frequency to 512 MHz */
 cpu_ppc_tb_init(env, TIMEBASE_FREQ);
 
@@ -796,10 +803,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
 g_free(filename);
 
 
-/* Set up Interrupt Controller */
-spapr-icp = xics_system_init(XICS_IRQS);
-spapr-next_irq = XICS_IRQ_BASE;
-
 /* Set up EPOW events infrastructure */
 spapr_events_init(spapr);
 
diff --git a/hw/xics.c b/hw/xics.c
index 6b08430..8860355 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -507,42 +507,36 @@ static void xics_reset(void *opaque)
 }
 }
 
-struct icp_state *xics_system_init(int nr_irqs)
+void xics_cpu_setup(struct icp_state *icp, CPUPPCState *env)
 {
-CPUPPCState *env;
-int max_server_num;
-struct icp_state *icp;
-struct ics_state *ics;
+struct icp_server_state *ss = icp-ss[env-cpu_index];
 
-max_server_num = -1;
-for (env = first_cpu; env != NULL; env = env-next_cpu) {
-if (env-cpu_index  max_server_num) {
-max_server_num = env-cpu_index;
-}
-}
+assert(env-cpu_index  icp-nr_servers);
 
-icp = g_malloc0(sizeof(*icp));
-icp-nr_servers = max_server_num + 1;
-icp-ss = g_malloc0(icp-nr_servers*sizeof(struct icp_server_state));
+switch (PPC_INPUT(env)) {
+case PPC_FLAGS_INPUT_POWER7:
+ss-output = env-irq_inputs[POWER7_INPUT_INT];
+break;
 
-for (env = first_cpu; env != NULL; env = env-next_cpu) {
-struct icp_server_state *ss = icp-ss[env-cpu_index];
+case PPC_FLAGS_INPUT_970:
+ss-output = env-irq_inputs[PPC970_INPUT_INT];
+break;
 
-switch (PPC_INPUT(env)) {
-case PPC_FLAGS_INPUT_POWER7:
-ss-output = env-irq_inputs[POWER7_INPUT_INT];
-break;
+default:
+fprintf(stderr, XICS interrupt controller does not support this CPU 
+bus model\n);
+abort();
+}
+}
 
-case PPC_FLAGS_INPUT_970:
-ss-output = env-irq_inputs[PPC970_INPUT_INT];
-break;
+struct icp_state *xics_system_init(int nr_servers, int nr_irqs)
+{
+struct icp_state *icp;
+struct ics_state *ics;
 
-default:
-hw_error(XICS interrupt model does not support this CPU bus 
- model\n);
-exit(1);
-}
-}
+icp = g_malloc0(sizeof(*icp));
+icp-nr_servers = nr_servers;
+icp-ss = g_malloc0(icp-nr_servers*sizeof(struct icp_server_state));
 
 ics = g_malloc0(sizeof(*ics));
 ics-nr_irqs = nr_irqs;
diff --git a/hw/xics.h b/hw/xics.h
index c3bf008..b43678a 100644
--- a/hw/xics.h
+++ b/hw/xics.h
@@ -35,6 +35,7 @@ struct icp_state;
 qemu_irq xics_get_qirq(struct icp_state *icp, int irq);
 void xics_set_irq_type(struct icp_state *icp, int irq, bool lsi);
 
-struct icp_state *xics_system_init(int nr_irqs);
+struct icp_state *xics_system_init(int nr_servers, int nr_irqs);
+void xics_cpu_setup(struct icp_state *icp, CPUPPCState *env);
 
 #endif /* __XICS_H__ */
-- 
1.7.10.4




[Qemu-devel] [PATCH 13/15] target-pcc: Convert ppcemb_tlb_t to use fixed 64-bit RPN

2012-10-18 Thread David Gibson
Currently the ppcemb_tlb_t struct, used on a number of embedded ppc models
to represent a TLB entry contains a target_phys_addr_t.  That works
reasonably for now, but is troublesome for saving the state, which we'll
want to do in future.  target_phys_addr_t is a large enough type to contain
a physical address for any supported machine - and can thus, in theory at
least, vary depending on what machines are enabled other than the one
we're actually using right now.  This makes it unsuitable for describing
in vmstate.

This patch therefore changes ppcemb_tlb_t to use a fixed 64-bit integer
which we know is sufficient for all the machines which use this structure.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 target-ppc/cpu.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index cde6da0..f30e0c7 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -355,7 +355,7 @@ struct ppc6xx_tlb_t {
 
 typedef struct ppcemb_tlb_t ppcemb_tlb_t;
 struct ppcemb_tlb_t {
-target_phys_addr_t RPN;
+uint64_t RPN;
 target_ulong EPN;
 target_ulong PID;
 target_ulong size;
-- 
1.7.10.4




[Qemu-devel] [PATCH 12/15] pseries: Split xics irq configuration from state information

2012-10-18 Thread David Gibson
Currently the XICS irq controller code has a per-irq state structure which
amongst other things includes whether the interrupt is level or message
triggered - this is configured by the platform code, and is not directly
visible to the guest.  This leads to a slightly awkward construct at reset
time where we need to reset everything in the state structure _except_ the
lsi/msi flag, which needs to retain the information given at platform init
time.

More importantly this flag will make matching the qemu state to the KVM
state for the upcoming in-kernel XICS implementation more awkward.  This
patch, therefore, removes this flag from the per-irq state structure,
instead adding a parallel array giving the lsi/msi configuration per irq.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/xics.c |   20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/hw/xics.c b/hw/xics.c
index 403afdb..5e20f0f 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -179,13 +179,13 @@ struct ics_irq_state {
 #define XICS_STATUS_REJECTED   0x4
 #define XICS_STATUS_MASKED_PENDING 0x8
 uint8_t status;
-bool lsi;
 };
 
 struct ics_state {
 int nr_irqs;
 int offset;
 qemu_irq *qirqs;
+bool *islsi;
 struct ics_irq_state *irqs;
 struct icp_state *icp;
 };
@@ -254,9 +254,8 @@ static void set_irq_lsi(struct ics_state *ics, int srcno, 
int val)
 static void ics_set_irq(void *opaque, int srcno, int val)
 {
 struct ics_state *ics = (struct ics_state *)opaque;
-struct ics_irq_state *irq = ics-irqs + srcno;
 
-if (irq-lsi) {
+if (ics-islsi[srcno]) {
 set_irq_lsi(ics, srcno, val);
 } else {
 set_irq_msi(ics, srcno, val);
@@ -293,7 +292,7 @@ static void ics_write_xive(struct ics_state *ics, int nr, 
int server,
 
 trace_xics_ics_write_xive(nr, srcno, server, priority);
 
-if (irq-lsi) {
+if (ics-islsi[srcno]) {
 write_xive_lsi(ics, srcno);
 } else {
 write_xive_msi(ics, srcno);
@@ -314,10 +313,8 @@ static void ics_resend(struct ics_state *ics)
 int i;
 
 for (i = 0; i  ics-nr_irqs; i++) {
-struct ics_irq_state *irq = ics-irqs + i;
-
 /* FIXME: filter by server#? */
-if (irq-lsi) {
+if (ics-islsi[i]) {
 resend_lsi(ics, i);
 } else {
 resend_msi(ics, i);
@@ -332,7 +329,7 @@ static void ics_eoi(struct ics_state *ics, int nr)
 
 trace_xics_ics_eoi(nr);
 
-if (irq-lsi) {
+if (ics-islsi[srcno]) {
 irq-status = ~XICS_STATUS_SENT;
 }
 }
@@ -354,7 +351,7 @@ void xics_set_irq_type(struct icp_state *icp, int irq, bool 
lsi)
 {
 assert(ics_valid_irq(icp-ics, irq));
 
-icp-ics-irqs[irq - icp-ics-offset].lsi = lsi;
+icp-ics-islsi[irq - icp-ics-offset] = lsi;
 }
 
 static target_ulong h_cppr(CPUPPCState *env, sPAPREnvironment *spapr,
@@ -515,10 +512,8 @@ static void xics_reset(void *opaque)
 qemu_set_irq(icp-ss[i].output, 0);
 }
 
+memset(ics-irqs, 0, sizeof(struct ics_irq_state) * ics-nr_irqs);
 for (i = 0; i  ics-nr_irqs; i++) {
-/* Reset everything *except* the type */
-ics-irqs[i].server = 0;
-ics-irqs[i].status = 0;
 ics-irqs[i].priority = 0xff;
 ics-irqs[i].saved_priority = 0xff;
 }
@@ -559,6 +554,7 @@ struct icp_state *xics_system_init(int nr_servers, int 
nr_irqs)
 ics-nr_irqs = nr_irqs;
 ics-offset = XICS_IRQ_BASE;
 ics-irqs = g_malloc0(nr_irqs * sizeof(struct ics_irq_state));
+ics-islsi = g_malloc0(nr_irqs * sizeof(bool));
 
 icp-ics = ics;
 ics-icp = icp;
-- 
1.7.10.4




[Qemu-devel] [PATCH 07/15] pseries: Cleanup duplications of ics_valid_irq() code

2012-10-18 Thread David Gibson
A couple of places in xics.c open-coded the same logic as is already
implemented in ics_valid_irq().  This patch fixes the code duplication.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/xics.c |6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/xics.c b/hw/xics.c
index db01fe3..6b08430 100644
--- a/hw/xics.c
+++ b/hw/xics.c
@@ -326,8 +326,7 @@ static void ics_eoi(struct ics_state *ics, int nr)
 
 qemu_irq xics_get_qirq(struct icp_state *icp, int irq)
 {
-if ((irq  icp-ics-offset)
-|| (irq = (icp-ics-offset + icp-ics-nr_irqs))) {
+if (!ics_valid_irq(icp-ics, irq)) {
 return NULL;
 }
 
@@ -336,8 +335,7 @@ qemu_irq xics_get_qirq(struct icp_state *icp, int irq)
 
 void xics_set_irq_type(struct icp_state *icp, int irq, bool lsi)
 {
-assert((irq = icp-ics-offset)
-(irq  (icp-ics-offset + icp-ics-nr_irqs)));
+assert(ics_valid_irq(icp-ics, irq));
 
 icp-ics-irqs[irq - icp-ics-offset].lsi = lsi;
 }
-- 
1.7.10.4




[Qemu-devel] [PATCH 10/15] pseries: Allow RTAS tokens without a qemu handler

2012-10-18 Thread David Gibson
From: Ben Herrenschmidt b...@kernel.crashing.org

Kernel-based RTAS calls will not have a qemu handler, but will
still be registered in qemu in order to be assigned a token
number and appear in the device-tree.

Let's test for the name being NULL rather than the handler
when deciding to skip an entry while building the device-tree

Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/spapr_rtas.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/spapr_rtas.c b/hw/spapr_rtas.c
index 38c105f..72cad53 100644
--- a/hw/spapr_rtas.c
+++ b/hw/spapr_rtas.c
@@ -298,7 +298,7 @@ int spapr_rtas_device_tree_setup(void *fdt, 
target_phys_addr_t rtas_addr,
 for (i = 0; i  TOKEN_MAX; i++) {
 struct rtas_call *call = rtas_table[i];
 
-if (!call-fn) {
+if (!call-name) {
 continue;
 }
 
-- 
1.7.10.4




[Qemu-devel] [PATCH 04/15] target-ppc: Extend FPU state for newer POWER CPUs

2012-10-18 Thread David Gibson
This patch adds some extra FPU state to CPUPPCState.  Specifically,
fpscr is extended to a target_ulong bits, since some recent (64 bit)
CPUs now have more status bits than fit inside 32 bits.  Also, we add
the 32 VSR registers present on CPUs with VSX (these extend the
standard FP regs, which together with the Altivec/VMX registers form a
64 x 128bit register file for VSX).

We don't actually support the instructions using these extra registers
in TCG yet, but we still need a place to store the state so we can
sync it with KVM and savevm/loadvm it.  This patch updates the savevm
code to not fail on the extended state, but also does not actually
save it - that's a project for another patch.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---

v2:
 * Used target_ulong instead of uint64_t, since the extended state is used
only on ppc64 targets.
 * Fixed the TCG mapping of fpscr to match the new type.
---
 target-ppc/cpu.h   |4 +++-
 target-ppc/machine.c   |8 ++--
 target-ppc/translate.c |   29 ++---
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index eb2f43d..cde6da0 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -963,7 +963,7 @@ struct CPUPPCState {
 /* floating point registers */
 float64 fpr[32];
 /* floating point status and control register */
-uint32_t fpscr;
+target_ulong fpscr;
 
 /* Next instruction pointer */
 target_ulong nip;
@@ -1014,6 +1014,8 @@ struct CPUPPCState {
 /* Altivec registers */
 ppc_avr_t avr[32];
 uint32_t vscr;
+/* VSX registers */
+uint64_t vsr[32];
 /* SPE registers */
 uint64_t spe_acc;
 uint32_t spe_fscr;
diff --git a/target-ppc/machine.c b/target-ppc/machine.c
index 21ce757..5e7bc00 100644
--- a/target-ppc/machine.c
+++ b/target-ppc/machine.c
@@ -6,6 +6,7 @@ void cpu_save(QEMUFile *f, void *opaque)
 {
 CPUPPCState *env = (CPUPPCState *)opaque;
 unsigned int i, j;
+uint32_t fpscr;
 
 for (i = 0; i  32; i++)
 qemu_put_betls(f, env-gpr[i]);
@@ -30,7 +31,8 @@ void cpu_save(QEMUFile *f, void *opaque)
 u.d = env-fpr[i];
 qemu_put_be64(f, u.l);
 }
-qemu_put_be32s(f, env-fpscr);
+fpscr = env-fpscr;
+qemu_put_be32s(f, fpscr);
 qemu_put_sbe32s(f, env-access_type);
 #if defined(TARGET_PPC64)
 qemu_put_betls(f, env-asr);
@@ -90,6 +92,7 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id)
 CPUPPCState *env = (CPUPPCState *)opaque;
 unsigned int i, j;
 target_ulong sdr1;
+uint32_t fpscr;
 
 for (i = 0; i  32; i++)
 qemu_get_betls(f, env-gpr[i]);
@@ -114,7 +117,8 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id)
 u.l = qemu_get_be64(f);
 env-fpr[i] = u.d;
 }
-qemu_get_be32s(f, env-fpscr);
+qemu_get_be32s(f, fpscr);
+env-fpscr = fpscr;
 qemu_get_sbe32s(f, env-access_type);
 #if defined(TARGET_PPC64)
 qemu_get_betls(f, env-asr);
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 1042268..56725e6 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -68,7 +68,7 @@ static TCGv cpu_cfar;
 #endif
 static TCGv cpu_xer;
 static TCGv cpu_reserve;
-static TCGv_i32 cpu_fpscr;
+static TCGv cpu_fpscr;
 static TCGv_i32 cpu_access_type;
 
 #include gen-icount.h
@@ -163,8 +163,8 @@ void ppc_translate_init(void)
  offsetof(CPUPPCState, reserve_addr),
  reserve_addr);
 
-cpu_fpscr = tcg_global_mem_new_i32(TCG_AREG0,
-   offsetof(CPUPPCState, fpscr), fpscr);
+cpu_fpscr = tcg_global_mem_new(TCG_AREG0,
+   offsetof(CPUPPCState, fpscr), fpscr);
 
 cpu_access_type = tcg_global_mem_new_i32(TCG_AREG0,
  offsetof(CPUPPCState, 
access_type), access_type);
@@ -2302,6 +2302,7 @@ GEN_FLOAT_B(neg, 0x08, 0x01, 0, PPC_FLOAT);
 /* mcrfs */
 static void gen_mcrfs(DisasContext *ctx)
 {
+TCGv tmp = tcg_temp_new();
 int bfa;
 
 if (unlikely(!ctx-fpu_enabled)) {
@@ -2309,9 +2310,11 @@ static void gen_mcrfs(DisasContext *ctx)
 return;
 }
 bfa = 4 * (7 - crfS(ctx-opcode));
-tcg_gen_shri_i32(cpu_crf[crfD(ctx-opcode)], cpu_fpscr, bfa);
+tcg_gen_shri_tl(tmp, cpu_fpscr, bfa);
+tcg_gen_trunc_tl_i32(cpu_crf[crfD(ctx-opcode)], tmp);
+tcg_temp_free(tmp);
 tcg_gen_andi_i32(cpu_crf[crfD(ctx-opcode)], cpu_crf[crfD(ctx-opcode)], 
0xf);
-tcg_gen_andi_i32(cpu_fpscr, cpu_fpscr, ~(0xF  bfa));
+tcg_gen_andi_tl(cpu_fpscr, cpu_fpscr, ~(0xF  bfa));
 }
 
 /* mffs */
@@ -2322,7 +2325,7 @@ static void gen_mffs(DisasContext *ctx)
 return;
 }
 gen_reset_fpstatus();
-tcg_gen_extu_i32_i64(cpu_fpr[rD(ctx-opcode)], cpu_fpscr);
+tcg_gen_extu_tl_i64(cpu_fpr[rD(ctx-opcode)], cpu_fpscr);
 gen_compute_fprf(cpu_fpr[rD(ctx-opcode)], 0, 

[Qemu-devel] [PATCH 09/15] pseries: Return the token when we register an RTAS call

2012-10-18 Thread David Gibson
From: Michael Ellerman mich...@ellerman.id.au

The kernel will soon be able to service some RTAS calls. However the
choice of tokens will still be up to userspace. To support this have
spapr_rtas_register() return the token that is allocated for an
RTAS call, that allows the calling code to tell the kernel what the
token value is.

Signed-off-by: Michael Ellerman mich...@ellerman.id.au
Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/spapr.h  |2 +-
 hw/spapr_rtas.c |4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/spapr.h b/hw/spapr.h
index 3d02911..89c70b5 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -323,7 +323,7 @@ static inline void rtas_st(target_ulong phys, int n, 
uint32_t val)
 typedef void (*spapr_rtas_fn)(sPAPREnvironment *spapr, uint32_t token,
   uint32_t nargs, target_ulong args,
   uint32_t nret, target_ulong rets);
-void spapr_rtas_register(const char *name, spapr_rtas_fn fn);
+int spapr_rtas_register(const char *name, spapr_rtas_fn fn);
 target_ulong spapr_rtas_call(sPAPREnvironment *spapr,
  uint32_t token, uint32_t nargs, target_ulong args,
  uint32_t nret, target_ulong rets);
diff --git a/hw/spapr_rtas.c b/hw/spapr_rtas.c
index b96078b..38c105f 100644
--- a/hw/spapr_rtas.c
+++ b/hw/spapr_rtas.c
@@ -239,7 +239,7 @@ target_ulong spapr_rtas_call(sPAPREnvironment *spapr,
 return H_PARAMETER;
 }
 
-void spapr_rtas_register(const char *name, spapr_rtas_fn fn)
+int spapr_rtas_register(const char *name, spapr_rtas_fn fn)
 {
 int i;
 
@@ -255,7 +255,7 @@ void spapr_rtas_register(const char *name, spapr_rtas_fn fn)
 rtas_next-name = name;
 rtas_next-fn = fn;
 
-rtas_next++;
+return (rtas_next++ - rtas_table) + TOKEN_BASE;
 }
 
 int spapr_rtas_device_tree_setup(void *fdt, target_phys_addr_t rtas_addr,
-- 
1.7.10.4




[Qemu-devel] [PATCH 01/15] pseries: Don't allow duplicate registration of hcalls or RTAS calls

2012-10-18 Thread David Gibson
Currently the pseries machine code allows a callback to be registered
for a hypercall number twice, as long as it's the same callback the second
time.  We don't test for duplicate registrations of RTAS callbacks at all
so it will effectively be last registratiojn wins.

This was originally done because it was awkward to ensure that the
registration happened exactly once, but the code has since been
restructured so that's no longer the case.

Duplicate registration of a hypercall or RTAS call could well suggest
a duplicate initialization which could cause other problems, so this patch
makes duplicate registrations a bug, to prevent the old behaviour from
hiding other bugs.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
---
 hw/spapr_hcall.c |3 +--
 hw/spapr_rtas.c  |9 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index 194d9c2..762493a 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -670,11 +670,10 @@ void spapr_register_hypercall(target_ulong opcode, 
spapr_hcall_fn fn)
 } else {
 assert((opcode = KVMPPC_HCALL_BASE)  (opcode = KVMPPC_HCALL_MAX));
 
-
 slot = kvmppc_hypercall_table[opcode - KVMPPC_HCALL_BASE];
 }
 
-assert(!(*slot) || (fn == *slot));
+assert(!(*slot));
 *slot = fn;
 }
 
diff --git a/hw/spapr_rtas.c b/hw/spapr_rtas.c
index b808f80..b96078b 100644
--- a/hw/spapr_rtas.c
+++ b/hw/spapr_rtas.c
@@ -241,6 +241,15 @@ target_ulong spapr_rtas_call(sPAPREnvironment *spapr,
 
 void spapr_rtas_register(const char *name, spapr_rtas_fn fn)
 {
+int i;
+
+for (i = 0; i  (rtas_next - rtas_table); i++) {
+if (strcmp(name, rtas_table[i].name) == 0) {
+fprintf(stderr, RTAS call \%s\ registered twice\n, name);
+exit(1);
+}
+}
+
 assert(rtas_next  (rtas_table + TOKEN_MAX));
 
 rtas_next-name = name;
-- 
1.7.10.4




Re: [Qemu-devel] [PATCH v3] qemu-img: document 'info --backing-chain'

2012-10-18 Thread Stefan Hajnoczi
On Thu, Oct 18, 2012 at 11:25:34AM +0530, Kashyap Chamarthy wrote:
 Signed-off-by: Kashyap Chamarthy kashyap...@gmail.com
 ---
  qemu-img-cmds.hx |  4 ++--
  qemu-img.texi| 21 -
  2 files changed, 22 insertions(+), 3 deletions(-)

Reviewed-by: Stefan Hajnoczi stefa...@redhat.com



Re: [Qemu-devel] [Qemu-trivial] [PATCH] block: bdrv_create(): don't leak cco.filename on error

2012-10-18 Thread Stefan Hajnoczi
On Wed, Oct 17, 2012 at 04:45:25PM -0300, Luiz Capitulino wrote:
 
 Signed-off-by: Luiz Capitulino lcapitul...@redhat.com
 ---
  block.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

Reviewed-by: Stefan Hajnoczi stefa...@redhat.com



Re: [Qemu-devel] [PATCH 2/2] Add check_rxov into VMState.

2012-10-18 Thread Stefan Hajnoczi
On Wed, Oct 17, 2012 at 08:31:47PM +0200, Dmitry Fleytman wrote:
 @@ -1100,6 +1100,11 @@ static bool is_version_1(void *opaque, int version_id)
  return version_id == 1;
  }
  
 +static bool is_version_3(void *opaque, int version_id)
 +{
 +return version_id == 1;
 +}

version_id == 3?



[Qemu-devel] [PATCH 01/72] fix virtfs

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 fsdev/virtfs-proxy-helper.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fsdev/virtfs-proxy-helper.c b/fsdev/virtfs-proxy-helper.c
index f9a8270..771dc4e 100644
--- a/fsdev/virtfs-proxy-helper.c
+++ b/fsdev/virtfs-proxy-helper.c
@@ -282,6 +282,7 @@ static int send_status(int sockfd, struct iovec *iovec, int 
status)
  */
 static int setfsugid(int uid, int gid)
 {
+int ret;
 /*
  * We still need DAC_OVERRIDE because  we don't change
  * supplementary group ids, and hence may be subjected DAC rules
@@ -290,8 +291,14 @@ static int setfsugid(int uid, int gid)
 CAP_DAC_OVERRIDE,
 };

-setfsgid(gid);
-setfsuid(uid);
+ret = setfsgid(gid);
+if (ret  0) {
+return ret;
+}
+ret = setfsuid(uid);
+if (ret  0) {
+return ret;
+}

 if (uid != 0 || gid != 0) {
 return do_cap_set(cap_list, ARRAY_SIZE(cap_list), 0);
-- 
1.7.11.7




[Qemu-devel] [PATCH 13/30] savevm: New save live migration method: pending

2012-10-18 Thread Juan Quintela
Code just now does (simplified for clarity)

if (qemu_savevm_state_iterate(s-file) == 1) {
   vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
   qemu_savevm_state_complete(s-file);
}

Problem here is that qemu_savevm_state_iterate() returns 1 when it
knows that remaining memory to sent takes less than max downtime.

But this means that we could end spending 2x max_downtime, one
downtime in qemu_savevm_iterate, and the other in
qemu_savevm_state_complete.

Changed code to:

pending_size = qemu_savevm_state_pending(s-file, max_size);
DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
if (pending_size = max_size) {
ret = qemu_savevm_state_iterate(s-file);
 } else {
vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
qemu_savevm_state_complete(s-file);
 }

So what we do is: at current network speed, we calculate the maximum
number of bytes we can sent: max_size.

Then we ask every save_live section how much they have pending.  If
they are less than max_size, we move to complete phase, otherwise we
do an iterate one.

This makes things much simpler, because now individual sections don't
have to caluclate the bandwidth (it was implossible to do right from
there).

Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c   | 48 ++--
 block-migration.c | 49 ++---
 buffered_file.c   | 25 ++---
 migration.c   | 22 +++---
 migration.h   |  2 +-
 savevm.c  | 19 +++
 sysemu.h  |  1 +
 vmstate.h |  1 +
 8 files changed, 83 insertions(+), 84 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 2d29828..1eefef8 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -608,12 +608,9 @@ static int ram_save_setup(QEMUFile *f, void *opaque)

 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
-uint64_t bytes_transferred_last;
-double bwidth = 0;
 int ret;
 int i;
-uint64_t expected_downtime;
-MigrationState *s = migrate_get_current();
+int64_t t0;

 qemu_mutex_lock_ramlist();

@@ -621,9 +618,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 reset_ram_globals();
 }

-bytes_transferred_last = bytes_transferred;
-bwidth = qemu_get_clock_ns(rt_clock);
-
+t0 = qemu_get_clock_ns(rt_clock);
 i = 0;
 while ((ret = qemu_file_rate_limit(f)) == 0) {
 int bytes_sent;
@@ -641,7 +636,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
iterations
 */
 if ((i  63) == 0) {
-uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 100;
+uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 100;
 if (t1  MAX_WAIT) {
 DPRINTF(big wait: % PRIu64  milliseconds, %d iterations\n,
 t1, i);
@@ -655,31 +650,10 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 return ret;
 }

-bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
-bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
-
-/* if we haven't transferred anything this round, force
- * expected_downtime to a very high value, but without
- * crashing */
-if (bwidth == 0) {
-bwidth = 0.01;
-}
-
 qemu_mutex_unlock_ramlist();
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

-expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-DPRINTF(ram_save_live: expected(% PRIu64 ) = max( PRIu64 )?\n,
-expected_downtime, migrate_max_downtime());
-
-if (expected_downtime = migrate_max_downtime()) {
-migration_bitmap_sync();
-expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-s-expected_downtime = expected_downtime / 100; /* ns - ms */
-
-return expected_downtime = migrate_max_downtime();
-}
-return 0;
+return i;
 }

 static int ram_save_complete(QEMUFile *f, void *opaque)
@@ -712,6 +686,19 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 return 0;
 }

+static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+uint64_t remaining_size;
+
+remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+
+if (remaining_size  max_size) {
+migration_bitmap_sync();
+remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+}
+return remaining_size;
+}
+
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 {
 int ret, rc = 0;
@@ -897,6 +884,7 @@ SaveVMHandlers savevm_ram_handlers = {
 .save_live_setup = ram_save_setup,
 .save_live_iterate = ram_save_iterate,
 .save_live_complete = ram_save_complete,
+.save_live_pending = ram_save_pending,
 .load_state = ram_load,
 .cancel = ram_migration_cancel,
 };
diff --git a/block-migration.c b/block-migration.c
index 71b9601..5db01fe 100644
--- 

[Qemu-devel] [PATCH 23/30] migration: print times for end phase

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 block.c |  6 ++
 cpus.c  | 17 +
 migration.c |  9 +
 savevm.c| 13 +
 4 files changed, 45 insertions(+)

diff --git a/block.c b/block.c
index e95f613..9cd6ffe 100644
--- a/block.c
+++ b/block.c
@@ -2648,9 +2648,15 @@ int bdrv_get_flags(BlockDriverState *bs)
 void bdrv_flush_all(void)
 {
 BlockDriverState *bs;
+int64_t start_time, end_time;
+
+start_time = qemu_get_clock_ms(rt_clock);

 QTAILQ_FOREACH(bs, bdrv_states, list) {
 bdrv_flush(bs);
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time flush device %s: %ld\n, bs-filename,
+   end_time - start_time);
 }
 }

diff --git a/cpus.c b/cpus.c
index 191cbf5..63e51f1 100644
--- a/cpus.c
+++ b/cpus.c
@@ -435,14 +435,31 @@ int cpu_is_stopped(CPUArchState *env)

 static void do_vm_stop(RunState state)
 {
+int64_t start_time, end_time;
+
 if (runstate_is_running()) {
+start_time = qemu_get_clock_ms(rt_clock);
 cpu_disable_ticks();
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time cpu_disable_ticks %ld\n, end_time - start_time);
 pause_all_vcpus();
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time pause_all_vcpus %ld\n, end_time - start_time);
 runstate_set(state);
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time runstate_set %ld\n, end_time - start_time);
 vm_state_notify(0, state);
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time vmstate_notify %ld\n, end_time - start_time);
 bdrv_drain_all();
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time bdrv_drain_all %ld\n, end_time - start_time);
 bdrv_flush_all();
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time bdrv_flush_all %ld\n, end_time - start_time);
 monitor_protocol_event(QEVENT_STOP, NULL);
+end_time = qemu_get_clock_ms(rt_clock);
+printf(time monitor_protocol_event %ld\n, end_time - start_time);
 }
 }

diff --git a/migration.c b/migration.c
index e6ff1f1..3856a99 100644
--- a/migration.c
+++ b/migration.c
@@ -697,19 +697,26 @@ static void *buffered_file_thread(void *opaque)
 DPRINTF(done iterating\n);
 start_time = qemu_get_clock_ms(rt_clock);
 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+end_time = qemu_get_clock_ms(rt_clock);
+printf(wakeup_request %ld\n, end_time - start_time);
 if (old_vm_running) {
 vm_stop(RUN_STATE_FINISH_MIGRATE);
 } else {
 vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 }
+end_time = qemu_get_clock_ms(rt_clock);
+printf(vm_stop 2 %ld\n, end_time - start_time);
 ret = qemu_savevm_state_complete(m-file);
 if (ret  0) {
 qemu_mutex_unlock_iothread();
 break;
 } else {
+end_time = qemu_get_clock_ms(rt_clock);
+printf(complete without error 3a %ld\n, end_time - 
start_time);
 migrate_fd_completed(m);
 }
 end_time = qemu_get_clock_ms(rt_clock);
+printf(completed %ld\n, end_time - start_time);
 m-total_time = end_time - m-total_time;
 m-downtime = end_time - start_time;
 if (m-state != MIG_STATE_COMPLETED) {
@@ -717,6 +724,8 @@ static void *buffered_file_thread(void *opaque)
 vm_start();
 }
 }
+end_time = qemu_get_clock_ms(rt_clock);
+printf(end completed stage %ld\n, end_time - start_time);
 last_round = true;
 }
 }
diff --git a/savevm.c b/savevm.c
index a68c37b..3ebde5c 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1629,9 +1629,14 @@ int qemu_savevm_state_complete(QEMUFile *f)
 {
 SaveStateEntry *se;
 int ret;
+int64_t t1;
+int64_t t0 = qemu_get_clock_ms(rt_clock);

 cpu_synchronize_all_states();
+t1 = qemu_get_clock_ms(rt_clock);
+printf(synchronize_all_states %ld\n, t1 - t0);

+t0 = t1;
 QTAILQ_FOREACH(se, savevm_handlers, entry) {
 if (!se-ops || !se-ops-save_live_complete) {
 continue;
@@ -1652,6 +1657,11 @@ int qemu_savevm_state_complete(QEMUFile *f)
 return ret;
 }
 }
+t1 = qemu_get_clock_ms(rt_clock);
+
+printf(migrate RAM %ld\n, t1 - t0);
+
+t0 = t1;

 QTAILQ_FOREACH(se, savevm_handlers, entry) {
 int len;
@@ -1676,6 +1686,9 @@ int qemu_savevm_state_complete(QEMUFile *f)
 trace_savevm_section_end(se-section_id);
 }

+t1 = qemu_get_clock_ms(rt_clock);
+
+printf(migrate rest devices %ld\n, t1 - t0);
 qemu_put_byte(f, 

[Qemu-devel] [PATCH 22/30] migration: unfold rest of migrate_fd_put_ready() into thread

2012-10-18 Thread Juan Quintela
This will allow us finer control in next patches.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 95 ++---
 1 file changed, 41 insertions(+), 54 deletions(-)

diff --git a/migration.c b/migration.c
index 7206866..e6ff1f1 100644
--- a/migration.c
+++ b/migration.c
@@ -644,54 +644,6 @@ static int64_t buffered_get_rate_limit(void *opaque)
 return s-xfer_limit;
 }

-static bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
-{
-int ret;
-uint64_t pending_size;
-bool last_round = false;
-
-qemu_mutex_lock_iothread();
-DPRINTF(iterate\n);
-pending_size = qemu_savevm_state_pending(s-file, max_size);
-DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
-if (pending_size = max_size) {
-ret = qemu_savevm_state_iterate(s-file);
-if (ret  0) {
-migrate_fd_error(s);
-}
-} else {
-int old_vm_running = runstate_is_running();
-int64_t start_time, end_time;
-
-DPRINTF(done iterating\n);
-start_time = qemu_get_clock_ms(rt_clock);
-qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-if (old_vm_running) {
-vm_stop(RUN_STATE_FINISH_MIGRATE);
-} else {
-vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-}
-
-if (qemu_savevm_state_complete(s-file)  0) {
-migrate_fd_error(s);
-} else {
-migrate_fd_completed(s);
-}
-end_time = qemu_get_clock_ms(rt_clock);
-s-total_time = end_time - s-total_time;
-s-downtime = end_time - start_time;
-if (s-state != MIG_STATE_COMPLETED) {
-if (old_vm_running) {
-vm_start();
-}
-}
-last_round = true;
-}
-qemu_mutex_unlock_iothread();
-
-return last_round;
-}
-
 /* 100ms  xfer_limit is the limit that we should write each 100ms */
 #define BUFFER_DELAY 100

@@ -716,6 +668,7 @@ static void *buffered_file_thread(void *opaque)

 while (true) {
 int64_t current_time = qemu_get_clock_ms(rt_clock);
+uint64_t pending_size;

 qemu_mutex_lock_iothread();
 if (m-state != MIG_STATE_ACTIVE) {
@@ -727,6 +680,46 @@ static void *buffered_file_thread(void *opaque)
 qemu_mutex_unlock_iothread();
 break;
 }
+if (s-bytes_xfer  s-xfer_limit) {
+DPRINTF(iterate\n);
+pending_size = qemu_savevm_state_pending(m-file, max_size);
+DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
+if (pending_size = max_size) {
+ret = qemu_savevm_state_iterate(m-file);
+if (ret  0) {
+qemu_mutex_unlock_iothread();
+break;
+}
+} else {
+int old_vm_running = runstate_is_running();
+int64_t start_time, end_time;
+
+DPRINTF(done iterating\n);
+start_time = qemu_get_clock_ms(rt_clock);
+qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+if (old_vm_running) {
+vm_stop(RUN_STATE_FINISH_MIGRATE);
+} else {
+vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+}
+ret = qemu_savevm_state_complete(m-file);
+if (ret  0) {
+qemu_mutex_unlock_iothread();
+break;
+} else {
+migrate_fd_completed(m);
+}
+end_time = qemu_get_clock_ms(rt_clock);
+m-total_time = end_time - m-total_time;
+m-downtime = end_time - start_time;
+if (m-state != MIG_STATE_COMPLETED) {
+if (old_vm_running) {
+vm_start();
+}
+}
+last_round = true;
+}
+}
 qemu_mutex_unlock_iothread();

 if (current_time = initial_time + BUFFER_DELAY) {
@@ -747,12 +740,6 @@ static void *buffered_file_thread(void *opaque)
 usleep((initial_time + BUFFER_DELAY - current_time)*1000);
 }
 buffered_flush(s);
-
-DPRINTF(file is ready\n);
-if (s-bytes_xfer  s-xfer_limit) {
-DPRINTF(notifying client\n);
-last_round = migrate_fd_put_ready(m, max_size);
-}
 }

 out:
-- 
1.7.11.7




[Qemu-devel] [PATCH 20/30] migration: move begining stage to the migration thread

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/migration.c b/migration.c
index 65f96b7..8cacbc3 100644
--- a/migration.c
+++ b/migration.c
@@ -647,7 +647,6 @@ static int64_t buffered_get_rate_limit(void *opaque)
 static bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
 {
 int ret;
-static bool first_time = true;
 uint64_t pending_size;
 bool last_round = false;

@@ -657,17 +656,6 @@ static bool migrate_fd_put_ready(MigrationState *s, 
uint64_t max_size)
 qemu_mutex_unlock_iothread();
 return false;
 }
-if (first_time) {
-first_time = false;
-DPRINTF(beginning savevm\n);
-ret = qemu_savevm_state_begin(s-file, s-params);
-if (ret  0) {
-DPRINTF(failed, %d\n, ret);
-migrate_fd_error(s);
-qemu_mutex_unlock_iothread();
-return false;
-}
-}

 DPRINTF(iterate\n);
 pending_size = qemu_savevm_state_pending(s-file, max_size);
@@ -716,9 +704,21 @@ static bool migrate_fd_put_ready(MigrationState *s, 
uint64_t max_size)
 static void *buffered_file_thread(void *opaque)
 {
 QEMUFileBuffered *s = opaque;
+MigrationState *m = s-migration_state;
 int64_t initial_time = qemu_get_clock_ms(rt_clock);
 int64_t max_size = 0;
 bool last_round = false;
+int ret;
+
+qemu_mutex_lock_iothread();
+DPRINTF(beginning savevm\n);
+ret = qemu_savevm_state_begin(m-file, m-params);
+if (ret  0) {
+DPRINTF(failed, %d\n, ret);
+qemu_mutex_unlock_iothread();
+goto out;
+}
+qemu_mutex_unlock_iothread();

 while (true) {
 int64_t current_time = qemu_get_clock_ms(rt_clock);
@@ -748,10 +748,14 @@ static void *buffered_file_thread(void *opaque)
 DPRINTF(file is ready\n);
 if (s-bytes_xfer  s-xfer_limit) {
 DPRINTF(notifying client\n);
-last_round = migrate_fd_put_ready(s-migration_state, max_size);
+last_round = migrate_fd_put_ready(m, max_size);
 }
 }

+out:
+if (ret  0) {
+migrate_fd_error(m);
+}
 g_free(s-buffer);
 g_free(s);
 return NULL;
-- 
1.7.11.7




[Qemu-devel] [PATCH 29/30] migration: Only go to the iterate stage if there is anything to send

2012-10-18 Thread Juan Quintela
Signed-off-by: Orit Wasserman owass...@redhat.com
Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration.c b/migration.c
index 3856a99..554d79a 100644
--- a/migration.c
+++ b/migration.c
@@ -684,7 +684,7 @@ static void *buffered_file_thread(void *opaque)
 DPRINTF(iterate\n);
 pending_size = qemu_savevm_state_pending(m-file, max_size);
 DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
-if (pending_size = max_size) {
+if (pending_size  pending_size = max_size) {
 ret = qemu_savevm_state_iterate(m-file);
 if (ret  0) {
 qemu_mutex_unlock_iothread();
-- 
1.7.11.7




Re: [Qemu-devel] [PATCHv2 1/2] signal: added a wrapper for sigprocmask function

2012-10-18 Thread Peter Maydell
On 17 October 2012 23:06, Alex Barcelo abarc...@ac.upc.edu wrote:
 On Wed, Oct 17, 2012 at 5:01 PM, Peter Maydell peter.mayd...@linaro.org 
 wrote:
 In my comments on v1 of this patch I wrote:
 I think all the uses of sigprocmask() in linux-user/signal.c also
 need to be do_sigprocmask(), as they are the guest trying to control
 its signal mask (via the mask it specifies for running signal handlers,
 or the mask it passes back when restoring context on return from a
 signal handler).

 I saw sigprocmask being used only inside sigreturn functions (hope I
 checked it correctly). I thought (maybe wrongly) that sigreturn should
 not be wrapped, just as internal sigprocmask calls are not proxyfied
 through do_sigprocmask.

 Sigreturn functions should not be called from guest directly, so they
 should not be a threat. And if some application uses it... well, then
 it is its fault, as POSIX does not guarantee any behavior (am I
 right?)

sigreturn functions operate on a structure passed in from the guest.
When the kernel enters a signal handler it stores the state of the
interrupted process on that process' stack before setting it up
to run the signal handler function. Return from the signal handler
involves the kernel (or in this case QEMU) restoring all that state.
Part of that state is the signal mask. Since the guest might have
changed that state, we must not trust it and so it goes through the
wrapper.

  I can't tell if it is worth doing it, or if there is any real case
 where qemu-user behavior is improving.

The point is consistency of design. Masks from the guest go through
the wrapper; masks used internally do not.

PS: if you disagree with a point in code review (and reviewers are
not always right!) it is better to send an email making the case
for why you disagree. If you just ignore it and send v2 patches
then you're forcing reviewers to hunt through your patch all
over again to check whether you paid attention the first time
round...

-- PMM



[Qemu-devel] [PATCH 17/30] migration: move migration_fd_put_ready()

2012-10-18 Thread Juan Quintela
Put it near its use and un-export it.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 132 ++--
 migration.h |   1 -
 2 files changed, 66 insertions(+), 67 deletions(-)

diff --git a/migration.c b/migration.c
index b1567f3..a99653e 100644
--- a/migration.c
+++ b/migration.c
@@ -300,72 +300,6 @@ ssize_t migrate_fd_put_buffer(MigrationState *s, const 
void *data,
 return ret;
 }

-bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
-{
-int ret;
-static bool first_time = true;
-uint64_t pending_size;
-bool last_round = false;
-
-qemu_mutex_lock_iothread();
-if (s-state != MIG_STATE_ACTIVE) {
-DPRINTF(put_ready returning because of non-active state\n);
-qemu_mutex_unlock_iothread();
-return false;
-}
-if (first_time) {
-first_time = false;
-DPRINTF(beginning savevm\n);
-ret = qemu_savevm_state_begin(s-file, s-params);
-if (ret  0) {
-DPRINTF(failed, %d\n, ret);
-migrate_fd_error(s);
-qemu_mutex_unlock_iothread();
-return false;
-}
-}
-
-DPRINTF(iterate\n);
-pending_size = qemu_savevm_state_pending(s-file, max_size);
-DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
-if (pending_size = max_size) {
-ret = qemu_savevm_state_iterate(s-file);
-if (ret  0) {
-migrate_fd_error(s);
-}
-} else {
-int old_vm_running = runstate_is_running();
-int64_t start_time, end_time;
-
-DPRINTF(done iterating\n);
-start_time = qemu_get_clock_ms(rt_clock);
-qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-if (old_vm_running) {
-vm_stop(RUN_STATE_FINISH_MIGRATE);
-} else {
-vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-}
-
-if (qemu_savevm_state_complete(s-file)  0) {
-migrate_fd_error(s);
-} else {
-migrate_fd_completed(s);
-}
-end_time = qemu_get_clock_ms(rt_clock);
-s-total_time = end_time - s-total_time;
-s-downtime = end_time - start_time;
-if (s-state != MIG_STATE_COMPLETED) {
-if (old_vm_running) {
-vm_start();
-}
-}
-last_round = true;
-}
-qemu_mutex_unlock_iothread();
-
-return last_round;
-}
-
 static void migrate_fd_cancel(MigrationState *s)
 {
 if (s-state != MIG_STATE_ACTIVE)
@@ -718,6 +652,72 @@ static int64_t buffered_get_rate_limit(void *opaque)
 return s-xfer_limit;
 }

+static bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
+{
+int ret;
+static bool first_time = true;
+uint64_t pending_size;
+bool last_round = false;
+
+qemu_mutex_lock_iothread();
+if (s-state != MIG_STATE_ACTIVE) {
+DPRINTF(put_ready returning because of non-active state\n);
+qemu_mutex_unlock_iothread();
+return false;
+}
+if (first_time) {
+first_time = false;
+DPRINTF(beginning savevm\n);
+ret = qemu_savevm_state_begin(s-file, s-params);
+if (ret  0) {
+DPRINTF(failed, %d\n, ret);
+migrate_fd_error(s);
+qemu_mutex_unlock_iothread();
+return false;
+}
+}
+
+DPRINTF(iterate\n);
+pending_size = qemu_savevm_state_pending(s-file, max_size);
+DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
+if (pending_size = max_size) {
+ret = qemu_savevm_state_iterate(s-file);
+if (ret  0) {
+migrate_fd_error(s);
+}
+} else {
+int old_vm_running = runstate_is_running();
+int64_t start_time, end_time;
+
+DPRINTF(done iterating\n);
+start_time = qemu_get_clock_ms(rt_clock);
+qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+if (old_vm_running) {
+vm_stop(RUN_STATE_FINISH_MIGRATE);
+} else {
+vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+}
+
+if (qemu_savevm_state_complete(s-file)  0) {
+migrate_fd_error(s);
+} else {
+migrate_fd_completed(s);
+}
+end_time = qemu_get_clock_ms(rt_clock);
+s-total_time = end_time - s-total_time;
+s-downtime = end_time - start_time;
+if (s-state != MIG_STATE_COMPLETED) {
+if (old_vm_running) {
+vm_start();
+}
+}
+last_round = true;
+}
+qemu_mutex_unlock_iothread();
+
+return last_round;
+}
+
 /* 100ms  xfer_limit is the limit that we should write each 100ms */
 #define BUFFER_DELAY 100

diff --git a/migration.h b/migration.h
index 40c8e9c..f4c4d1e 100644
--- a/migration.h
+++ b/migration.h
@@ -81,7 +81,6 @@ void migrate_fd_connect(MigrationState *s);

 ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
 

[Qemu-devel] [PATCH 00/30] Migration thread 20121017 edition

2012-10-18 Thread Juan Quintela
Hi

This series apply on top of the refactoring that I sent yesterday.
Changes from the last version include:

- buffered_file.c is gone, its functionality is merged in migration.c
  special attention to the megre of buffered_file_thread() 
  migration_file_put_notify().

- Some more bitmap handling optimizations (thanks to Orit  Paolo for
  suggestions and code and Vinod for testing)

Please review.  Included is the pointer to the full tree.

Thanks, Juan.

The following changes since commit b6348f29d033d5a8a26f633d2ee94362595f32a4:

  target-arm/translate: Fix RRX operands (2012-10-17 19:56:46 +0200)

are available in the git repository at:

  http://repo.or.cz/r/qemu/quintela.git migration-thread-20121017

for you to fetch changes up to 486dabc29f56d8f0e692395d4a6cd483b3a77f01:

  ram: optimize migration bitmap walking (2012-10-18 09:20:34 +0200)


v3:

This is work in progress on top of the previous migration series just sent.

- Introduces a thread for migration instead of using a timer and callback
- remove the writting to the fd from the iothread lock
- make the writes synchronous
- Introduce a new pending method that returns how many bytes are pending for
  one save live section
- last patch just shows printfs to see where the time is being spent
  on the migration complete phase.
  (yes it pollutes all uses of stop on the monitor)

So far I have found that we spent a lot of time on bdrv_flush_all() It
can take from 1ms to 600ms (yes, it is not a typo).  That dwarfs the
migration default downtime time (30ms).

Stop all vcpus:

- it works now (after the changes on qemu_cpu_is_vcpu on the previous
  series) caveat is that the time that brdv_flush_all() takes is
  unpredictable.  Any silver bullets?

  Paolo suggested to call for migration completion phase:

  bdrv_aio_flush_all();
  Sent the dirty pages;
  bdrv_drain_all()
  brdv_flush_all()
  another round through the bitmap in case that completions have
  changed some page

  Paolo, did I get it right?
  Any other suggestion?

- migrate_cancel() is not properly implemented (as in the film that we
  take no locks, ...)

- expected_downtime is not calculated.

  I am about to merge migrate_fd_put_ready  buffered_thread() and
  that would make trivial to calculate.

It outputs something like:

wakeup_request 0
time cpu_disable_ticks 0
time pause_all_vcpus 1
time runstate_set 1
time vmstate_notify 2
time bdrv_drain_all 2
time flush device 
/dev/disk/by-path/ip-192.168.10.200:3260-iscsi-iqn.2010-12.org.trasno:iscsi.lvm-lun-1:
 3
time flush device : 3
time flush device : 3
time flush device : 3
time bdrv_flush_all 5
time monitor_protocol_event 5
vm_stop 2 5
synchronize_all_states 1
migrate RAM 37
migrate rest devices 1
complete without error 3a 44
completed 45
end completed stage 45

As you can see, we estimate that we can sent all pending data in 30ms,
it took 37ms to send the RAM (that is what we calculate).  So
estimation is quite good.

What it gives me lots of variation is on the line with device name of time 
flush device.
That is what varies between 1ms to 600ms

This is in a completely idle guest.  I am running:

while (1) {
uint64_t delay;

if (gettimeofday(t0, NULL) != 0)
perror(gettimeofday 1);
if (usleep(ms2us(10)) != 0)
perror(usleep);
if (gettimeofday(t1, NULL) != 0)
perror(gettimeofday 2);

t1.tv_usec -= t0.tv_usec;
if (t1.tv_usec  0) {
t1.tv_usec += 100;
t1.tv_sec--;
}
t1.tv_sec -= t0.tv_sec;

delay = t1.tv_sec * 1000 + t1.tv_usec/1000;

if (delay  100)
printf(delay of %ld ms\n, delay);
   }

To see the latency inside the guest (i.e. ask for a 10ms sleep, and see how 
long it takes).


[root@d1 ~]# ./timer 
delay of 161 ms
delay of 135 ms
delay of 143 ms
delay of 132 ms
delay of 131 ms
delay of 141 ms
delay of 113 ms
delay of 119 ms
delay of 114 ms


But that values are independent of migration.  Without even starting
the migration, idle guest doing nothing, we get it sometimes.



Juan Quintela (27):
  buffered_file: Move from using a timer to use a thread
  migration: make qemu_fopen_ops_buffered() return void
  migration: stop all cpus correctly
  migration: make writes blocking
  migration: remove unfreeze logic
  migration: take finer locking
  buffered_file: Unfold the trick to restart generating migration data
  buffered_file: don't flush on put buffer
  buffered_file: unfold buffered_append in buffered_put_buffer
  savevm: New save live migration method: pending
  migration: include qemu-file.h
  migration-fd: remove duplicate include
  migration: move buffered_file.c code into migration.c
  migration: move migration_fd_put_ready()
  migration: Inline qemu_fopen_ops_buffered into migrate_fd_connect
  migration: move 

[Qemu-devel] [PATCH 19/30] migration: move migration notifier

2012-10-18 Thread Juan Quintela
At this point, it is waranteed that state is ACTIVE.  Old position
didn't assured hat.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/migration.c b/migration.c
index 1f783e1..65f96b7 100644
--- a/migration.c
+++ b/migration.c
@@ -432,8 +432,6 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 }
 return;
 }
-
-notifier_list_notify(migration_state_notifiers, s);
 }

 void qmp_migrate_cancel(Error **errp)
@@ -779,4 +777,5 @@ void migrate_fd_connect(MigrationState *migration_state)

 qemu_thread_create(s-thread, buffered_file_thread, s,
QEMU_THREAD_DETACHED);
+notifier_list_notify(migration_state_notifiers, s);
 }
-- 
1.7.11.7




[Qemu-devel] [PATCH 27/30] ram: Use memory_region_test_and_clear_dirty

2012-10-18 Thread Juan Quintela
This avoids having to do two walks over the dirty bitmap, once reading
the dirty bits, and anthoer cleaning them.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 6f39ebd..8391375 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -390,13 +390,12 @@ static void migration_bitmap_sync(void)

 QLIST_FOREACH(block, ram_list.blocks, next) {
 for (addr = 0; addr  block-length; addr += TARGET_PAGE_SIZE) {
-if (memory_region_get_dirty(block-mr, addr, TARGET_PAGE_SIZE,
-DIRTY_MEMORY_MIGRATION)) {
+if (memory_region_test_and_clear_dirty(block-mr,
+   addr, TARGET_PAGE_SIZE,
+   DIRTY_MEMORY_MIGRATION)) {
 migration_bitmap_set_dirty(block-mr, addr);
 }
 }
-memory_region_reset_dirty(block-mr, 0, block-length,
-  DIRTY_MEMORY_MIGRATION);
 }
 trace_migration_bitmap_sync_end(migration_dirty_pages
 - num_dirty_pages_init);
-- 
1.7.11.7




[Qemu-devel] [PATCH 25/30] ram: Add last_sent_block

2012-10-18 Thread Juan Quintela
This is the last block from where we have sent data.

Signed-off-by: Orit Wasserman owass...@redhat.com
Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch_init.c b/arch_init.c
index 8ac4c94..6f39ebd 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -336,6 +336,8 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t 
*current_data,
 /* This is the last block that we have visited serching for dirty pages
  */
 static RAMBlock *last_seen_block;
+/* This is the last block from where we have sent data */
+static RAMBlock *last_sent_block;
 static ram_addr_t last_offset;
 static unsigned long *migration_bitmap;
 static uint64_t migration_dirty_pages;
@@ -433,7 +435,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 mr = block-mr;
 if (migration_bitmap_test_and_reset_dirty(mr, offset)) {
 uint8_t *p;
-int cont = (block == last_seen_block) ?
+int cont = (block == last_sent_block) ?
 RAM_SAVE_FLAG_CONTINUE : 0;

 p = memory_region_get_ram_ptr(mr) + offset;
@@ -462,6 +464,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)

 /* if page is unmodified, continue to the next */
 if (bytes_sent != 0) {
+last_sent_block = block;
 break;
 }
 }
@@ -560,6 +563,7 @@ static void ram_migration_cancel(void *opaque)
 static void reset_ram_globals(void)
 {
 last_seen_block = NULL;
+last_sent_block = NULL;
 last_offset = 0;
 last_version = ram_list.version;
 sort_ram_list();
-- 
1.7.11.7




Re: [Qemu-devel] [PATCH 2/2] Add check_rxov into VMState.

2012-10-18 Thread Dmitry Fleytman
Oops, you are right :)

On Thu, Oct 18, 2012 at 9:24 AM, Stefan Hajnoczi stefa...@gmail.com wrote:
 On Wed, Oct 17, 2012 at 08:31:47PM +0200, Dmitry Fleytman wrote:
 @@ -1100,6 +1100,11 @@ static bool is_version_1(void *opaque, int version_id)
  return version_id == 1;
  }

 +static bool is_version_3(void *opaque, int version_id)
 +{
 +return version_id == 1;
 +}

 version_id == 3?



-- 
Dmitry Fleytman
Technology Expert and Consultant,

Daynix Computing Ltd.

Cell: +972-54-2819481

Skype: dmitry.fleytman



[Qemu-devel] [PATCH 18/30] migration: Inline qemu_fopen_ops_buffered into migrate_fd_connect

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 9 ++---
 migration.h | 2 --
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/migration.c b/migration.c
index a99653e..1f783e1 100644
--- a/migration.c
+++ b/migration.c
@@ -345,12 +345,6 @@ bool migration_has_failed(MigrationState *s)
 s-state == MIG_STATE_ERROR);
 }

-void migrate_fd_connect(MigrationState *s)
-{
-s-state = MIG_STATE_ACTIVE;
-qemu_fopen_ops_buffered(s);
-}
-
 static MigrationState *migrate_init(const MigrationParams *params)
 {
 MigrationState *s = migrate_get_current();
@@ -765,10 +759,11 @@ static void *buffered_file_thread(void *opaque)
 return NULL;
 }

-void qemu_fopen_ops_buffered(MigrationState *migration_state)
+void migrate_fd_connect(MigrationState *migration_state)
 {
 QEMUFileBuffered *s;

+migration_state-state = MIG_STATE_ACTIVE;
 s = g_malloc0(sizeof(*s));

 s-migration_state = migration_state;
diff --git a/migration.h b/migration.h
index f4c4d1e..60a2335 100644
--- a/migration.h
+++ b/migration.h
@@ -127,6 +127,4 @@ int migrate_use_xbzrle(void);
 int64_t migrate_xbzrle_cache_size(void);

 int64_t xbzrle_cache_resize(int64_t new_size);
-
-void qemu_fopen_ops_buffered(MigrationState *migration_state);
 #endif
-- 
1.7.11.7




[Qemu-devel] [PATCH 01/30] split MRU ram list

2012-10-18 Thread Juan Quintela
From: Paolo Bonzini pbonz...@redhat.com

Outside the execution threads the normal, non-MRU-ized order of
the RAM blocks should always be enough.  So manage two separate
lists, which will have separate locking rules.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c |  1 +
 cpu-all.h   |  4 +++-
 exec.c  | 18 +-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index e6effe8..4293557 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -48,6 +48,7 @@
 #include qemu/page_cache.h
 #include qmp-commands.h
 #include trace.h
+#include cpu-all.h

 #ifdef DEBUG_ARCH_INIT
 #define DPRINTF(fmt, ...) \
diff --git a/cpu-all.h b/cpu-all.h
index 6aa7e58..6558a6f 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -490,8 +490,9 @@ typedef struct RAMBlock {
 ram_addr_t offset;
 ram_addr_t length;
 uint32_t flags;
-char idstr[256];
+QLIST_ENTRY(RAMBlock) next_mru;
 QLIST_ENTRY(RAMBlock) next;
+char idstr[256];
 #if defined(__linux__)  !defined(TARGET_S390X)
 int fd;
 #endif
@@ -499,6 +500,7 @@ typedef struct RAMBlock {

 typedef struct RAMList {
 uint8_t *phys_dirty;
+QLIST_HEAD(, RAMBlock) blocks_mru;
 QLIST_HEAD(, RAMBlock) blocks;
 } RAMList;
 extern RAMList ram_list;
diff --git a/exec.c b/exec.c
index e63ad0d..718bbc2 100644
--- a/exec.c
+++ b/exec.c
@@ -56,6 +56,7 @@
 #include xen-mapcache.h
 #include trace.h
 #endif
+#include cpu-all.h

 #include cputlb.h

@@ -112,7 +113,10 @@ static uint8_t *code_gen_ptr;
 int phys_ram_fd;
 static int in_migration;

-RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
+RAMList ram_list = {
+.blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks),
+.blocks_mru = QLIST_HEAD_INITIALIZER(ram_list.blocks_mru)
+};

 static MemoryRegion *system_memory;
 static MemoryRegion *system_io;
@@ -633,6 +637,7 @@ bool tcg_enabled(void)
 void cpu_exec_init_all(void)
 {
 #if !defined(CONFIG_USER_ONLY)
+qemu_mutex_init(ram_list.mutex);
 memory_map_init();
 io_mem_init();
 #endif
@@ -2568,6 +2573,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void 
*host,
 new_block-length = size;

 QLIST_INSERT_HEAD(ram_list.blocks, new_block, next);
+QLIST_INSERT_HEAD(ram_list.blocks_mru, new_block, next_mru);

 ram_list.phys_dirty = g_realloc(ram_list.phys_dirty,
last_ram_offset()  TARGET_PAGE_BITS);
@@ -2595,6 +2601,7 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
 QLIST_FOREACH(block, ram_list.blocks, next) {
 if (addr == block-offset) {
 QLIST_REMOVE(block, next);
+QLIST_REMOVE(block, next_mru);
 g_free(block);
 return;
 }
@@ -2608,6 +2615,7 @@ void qemu_ram_free(ram_addr_t addr)
 QLIST_FOREACH(block, ram_list.blocks, next) {
 if (addr == block-offset) {
 QLIST_REMOVE(block, next);
+QLIST_REMOVE(block, next_mru);
 if (block-flags  RAM_PREALLOC_MASK) {
 ;
 } else if (mem_path) {
@@ -2713,12 +2721,12 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
 {
 RAMBlock *block;

-QLIST_FOREACH(block, ram_list.blocks, next) {
+QLIST_FOREACH(block, ram_list.blocks_mru, next_mru) {
 if (addr - block-offset  block-length) {
 /* Move this entry to to start of the list.  */
 if (block != QLIST_FIRST(ram_list.blocks)) {
-QLIST_REMOVE(block, next);
-QLIST_INSERT_HEAD(ram_list.blocks, block, next);
+QLIST_REMOVE(block, next_mru);
+QLIST_INSERT_HEAD(ram_list.blocks_mru, block, next_mru);
 }
 if (xen_enabled()) {
 /* We need to check if the requested address is in the RAM
@@ -2813,7 +2821,7 @@ int qemu_ram_addr_from_host(void *ptr, ram_addr_t 
*ram_addr)
 return 0;
 }

-QLIST_FOREACH(block, ram_list.blocks, next) {
+QLIST_FOREACH(block, ram_list.blocks_mru, next_mru) {
 /* This case append when the block is not mapped. */
 if (block-host == NULL) {
 continue;
-- 
1.7.11.7




Re: [Qemu-devel] [PATCH 1/6] qdev: rework device properties.

2012-10-18 Thread Paolo Bonzini
Il 17/10/2012 23:50, Eduardo Habkost ha scritto:
 I suppose that's the usual assumption when the file doesn't have an
 explicit license, as it's the license specified on the LICENSE file.
 
 The only problem is that the LICENSE file doesn't specify the GPL
 version, so it's a bit complicated. Some opinions can be found here:
 http://article.gmane.org/gmane.comp.emulators.qemu/122665.
 
 Unless every single author replies and accepts the adoption of another
 license (which I find very unlikely), I plan to submit a patch adding a
 GPLv2+ license header.

GPLv2+ should be the default license for files without a heading, except
if the history of the file shows that the code came originally from the
Linux kernel or another GPLv2-only project/file.

Paolo




[Qemu-devel] [PATCH 00/30] Migration thread 20121017 edition

2012-10-18 Thread Juan Quintela
Hi

This series apply on top of the refactoring that I sent yesterday.
Changes from the last version include:

- buffered_file.c is gone, its functionality is merged in migration.c
  special attention to the megre of buffered_file_thread() 
  migration_file_put_notify().

- Some more bitmap handling optimizations (thanks to Orit  Paolo for
  suggestions and code and Vinod for testing)

Please review.  Included is the pointer to the full tree.

Thanks, Juan.

The following changes since commit b6348f29d033d5a8a26f633d2ee94362595f32a4:

  target-arm/translate: Fix RRX operands (2012-10-17 19:56:46 +0200)

are available in the git repository at:

  http://repo.or.cz/r/qemu/quintela.git migration-thread-20121017

for you to fetch changes up to 486dabc29f56d8f0e692395d4a6cd483b3a77f01:

  ram: optimize migration bitmap walking (2012-10-18 09:20:34 +0200)


v3:

This is work in progress on top of the previous migration series just sent.

- Introduces a thread for migration instead of using a timer and callback
- remove the writting to the fd from the iothread lock
- make the writes synchronous
- Introduce a new pending method that returns how many bytes are pending for
  one save live section
- last patch just shows printfs to see where the time is being spent
  on the migration complete phase.
  (yes it pollutes all uses of stop on the monitor)

So far I have found that we spent a lot of time on bdrv_flush_all() It
can take from 1ms to 600ms (yes, it is not a typo).  That dwarfs the
migration default downtime time (30ms).

Stop all vcpus:

- it works now (after the changes on qemu_cpu_is_vcpu on the previous
  series) caveat is that the time that brdv_flush_all() takes is
  unpredictable.  Any silver bullets?

  Paolo suggested to call for migration completion phase:

  bdrv_aio_flush_all();
  Sent the dirty pages;
  bdrv_drain_all()
  brdv_flush_all()
  another round through the bitmap in case that completions have
  changed some page

  Paolo, did I get it right?
  Any other suggestion?

- migrate_cancel() is not properly implemented (as in the film that we
  take no locks, ...)

- expected_downtime is not calculated.

  I am about to merge migrate_fd_put_ready  buffered_thread() and
  that would make trivial to calculate.

It outputs something like:

wakeup_request 0
time cpu_disable_ticks 0
time pause_all_vcpus 1
time runstate_set 1
time vmstate_notify 2
time bdrv_drain_all 2
time flush device 
/dev/disk/by-path/ip-192.168.10.200:3260-iscsi-iqn.2010-12.org.trasno:iscsi.lvm-lun-1:
 3
time flush device : 3
time flush device : 3
time flush device : 3
time bdrv_flush_all 5
time monitor_protocol_event 5
vm_stop 2 5
synchronize_all_states 1
migrate RAM 37
migrate rest devices 1
complete without error 3a 44
completed 45
end completed stage 45

As you can see, we estimate that we can sent all pending data in 30ms,
it took 37ms to send the RAM (that is what we calculate).  So
estimation is quite good.

What it gives me lots of variation is on the line with device name of time 
flush device.
That is what varies between 1ms to 600ms

This is in a completely idle guest.  I am running:

while (1) {
uint64_t delay;

if (gettimeofday(t0, NULL) != 0)
perror(gettimeofday 1);
if (usleep(ms2us(10)) != 0)
perror(usleep);
if (gettimeofday(t1, NULL) != 0)
perror(gettimeofday 2);

t1.tv_usec -= t0.tv_usec;
if (t1.tv_usec  0) {
t1.tv_usec += 100;
t1.tv_sec--;
}
t1.tv_sec -= t0.tv_sec;

delay = t1.tv_sec * 1000 + t1.tv_usec/1000;

if (delay  100)
printf(delay of %ld ms\n, delay);
   }

To see the latency inside the guest (i.e. ask for a 10ms sleep, and see how 
long it takes).


[root@d1 ~]# ./timer 
delay of 161 ms
delay of 135 ms
delay of 143 ms
delay of 132 ms
delay of 131 ms
delay of 141 ms
delay of 113 ms
delay of 119 ms
delay of 114 ms


But that values are independent of migration.  Without even starting
the migration, idle guest doing nothing, we get it sometimes.

Juan Quintela (27):
  buffered_file: Move from using a timer to use a thread
  migration: make qemu_fopen_ops_buffered() return void
  migration: stop all cpus correctly
  migration: make writes blocking
  migration: remove unfreeze logic
  migration: take finer locking
  buffered_file: Unfold the trick to restart generating migration data
  buffered_file: don't flush on put buffer
  buffered_file: unfold buffered_append in buffered_put_buffer
  savevm: New save live migration method: pending
  migration: include qemu-file.h
  migration-fd: remove duplicate include
  migration: move buffered_file.c code into migration.c
  migration: move migration_fd_put_ready()
  migration: Inline qemu_fopen_ops_buffered into migrate_fd_connect
  migration: move 

Re: [Qemu-devel] [PATCH 1/2] Ignore RX tail kicks when RX disabled.

2012-10-18 Thread Stefan Hajnoczi
On Wed, Oct 17, 2012 at 08:31:46PM +0200, Dmitry Fleytman wrote:
 Device RX initization from driver's side consists of following steps:
   1. Initialize head and tail of RX ring to 0
   2. Enable Rx (set bit in RCTL register)
   3. Allocate buffers, fill descriptors
   4. Write ring tail
 
 Forth operation signals hardware that RX buffers available
 and it may start packets indication.
 
 Current implementation treats first operation (write 0 to ring tail)
 as signal of buffers availability and starts data transfers as soon
 as RX enable indicaton arrives.
 
 This is not correct because there is a chance that ring is still
 empty (third action not performed yet) and then memory corruption
 occures.

Any idea what the point of hw/e1000.c check_rxov is?  I see nothing in
the datasheet that requires these semantics.

The Linux e1000 driver never enables the RXO (rx fifo overflow)
interrupt, only RXDMT0 (receive descriptor minimum threshold).  This
means hw/e1000.c will not upset the Linux e1000 driver when
e1000_receive() gets called with check_rxov == 1 and RDH == RDT == 0.

BTW the Linux e1000 driver does not follow the sequence recommended in
the datasheet 14.4 Receive Initialization, which would avoid the weird
window of time where RDH == RDT == 0.

If we get rid of check_rxov and always check rxbuf space then we have
the correct behavior.  I'm a little nervous of simply dropping it
because its purpose is unclear to me :(.

Stefan



[Qemu-devel] [PATCH 10/30] buffered_file: Unfold the trick to restart generating migration data

2012-10-18 Thread Juan Quintela
This was needed before due to the way that the callbacks worked.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 buffered_file.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index a3aba2a..7692950 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -113,14 +113,6 @@ static int buffered_put_buffer(void *opaque, const uint8_t 
*buf, int64_t pos, in
 return error;
 }

-if (pos == 0  size == 0) {
-DPRINTF(file is ready\n);
-if (s-bytes_xfer  s-xfer_limit) {
-DPRINTF(notifying client\n);
-migrate_fd_put_ready(s-migration_state);
-}
-}
-
 return size;
 }

@@ -216,8 +208,15 @@ static void *buffered_file_thread(void *opaque)
 /* usleep expects microseconds */
 usleep((expire_time - current_time)*1000);
 }
-buffered_put_buffer(s, NULL, 0, 0);
+buffered_flush(s);
+
+DPRINTF(file is ready\n);
+if (s-bytes_xfer  s-xfer_limit) {
+DPRINTF(notifying client\n);
+migrate_fd_put_ready(s-migration_state);
+}
 }
+
 g_free(s-buffer);
 g_free(s);
 return NULL;
-- 
1.7.11.7




[Qemu-devel] [PATCH 09/30] migration: take finer locking

2012-10-18 Thread Juan Quintela
Instead of locking the whole migration_thread inside loop, just lock
migration_fd_put_notify, that is what interacts with the rest of the
world.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 buffered_file.c | 2 --
 migration.c | 5 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index 876cc8c..a3aba2a 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -216,9 +216,7 @@ static void *buffered_file_thread(void *opaque)
 /* usleep expects microseconds */
 usleep((expire_time - current_time)*1000);
 }
-qemu_mutex_lock_iothread();
 buffered_put_buffer(s, NULL, 0, 0);
-qemu_mutex_unlock_iothread();
 }
 g_free(s-buffer);
 g_free(s);
diff --git a/migration.c b/migration.c
index 29ee710..82c2663 100644
--- a/migration.c
+++ b/migration.c
@@ -305,8 +305,10 @@ void migrate_fd_put_ready(MigrationState *s)
 int ret;
 static bool first_time = true;

+qemu_mutex_lock_iothread();
 if (s-state != MIG_STATE_ACTIVE) {
 DPRINTF(put_ready returning because of non-active state\n);
+qemu_mutex_unlock_iothread();
 return;
 }
 if (first_time) {
@@ -316,6 +318,7 @@ void migrate_fd_put_ready(MigrationState *s)
 if (ret  0) {
 DPRINTF(failed, %d\n, ret);
 migrate_fd_error(s);
+qemu_mutex_unlock_iothread();
 return;
 }
 }
@@ -351,6 +354,8 @@ void migrate_fd_put_ready(MigrationState *s)
 }
 }
 }
+qemu_mutex_unlock_iothread();
+
 }

 static void migrate_fd_cancel(MigrationState *s)
-- 
1.7.11.7




[Qemu-devel] [PATCH 15/30] migration-fd: remove duplicate include

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration-fd.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/migration-fd.c b/migration-fd.c
index 25f0245..8f6b55f 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -20,7 +20,6 @@
 #include qemu-char.h
 #include qemu-file.h
 #include block.h
-#include qemu_socket.h

 //#define DEBUG_MIGRATION_FD

-- 
1.7.11.7




[Qemu-devel] [PATCH 16/30] migration: move buffered_file.c code into migration.c

2012-10-18 Thread Juan Quintela
This only moves the code (also from buffered_file.h to migration.h).
Fix whitespace until checkpatch is happy.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 Makefile.objs   |   2 +-
 buffered_file.c | 244 
 buffered_file.h |  22 -
 migration.c | 218 +-
 migration.h |   1 +
 5 files changed, 219 insertions(+), 268 deletions(-)
 delete mode 100644 buffered_file.c
 delete mode 100644 buffered_file.h

diff --git a/Makefile.objs b/Makefile.objs
index 74b3542..3de8f27 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -73,7 +73,7 @@ extra-obj-$(CONFIG_LINUX) += fsdev/

 common-obj-y += tcg-runtime.o host-utils.o main-loop.o
 common-obj-y += input.o
-common-obj-y += buffered_file.o migration.o migration-tcp.o
+common-obj-y += migration.o migration-tcp.o
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += block-migration.o iohandler.o
 common-obj-y += pflib.o
diff --git a/buffered_file.c b/buffered_file.c
deleted file mode 100644
index c21f847..000
--- a/buffered_file.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * QEMU buffered QEMUFile
- *
- * Copyright IBM, Corp. 2008
- *
- * Authors:
- *  Anthony Liguori   aligu...@us.ibm.com
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * Contributions after 2012-01-13 are licensed under the terms of the
- * GNU GPL, version 2 or (at your option) any later version.
- */
-
-#include qemu-common.h
-#include hw/hw.h
-#include qemu-timer.h
-#include qemu-char.h
-#include buffered_file.h
-#include qemu-thread.h
-
-//#define DEBUG_BUFFERED_FILE
-
-typedef struct QEMUFileBuffered
-{
-MigrationState *migration_state;
-QEMUFile *file;
-size_t bytes_xfer;
-size_t xfer_limit;
-uint8_t *buffer;
-size_t buffer_size;
-size_t buffer_capacity;
-QemuThread thread;
-} QEMUFileBuffered;
-
-#ifdef DEBUG_BUFFERED_FILE
-#define DPRINTF(fmt, ...) \
-do { printf(buffered-file:  fmt, ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) \
-do { } while (0)
-#endif
-
-static ssize_t buffered_flush(QEMUFileBuffered *s)
-{
-size_t offset = 0;
-ssize_t ret = 0;
-
-DPRINTF(flushing %zu byte(s) of data\n, s-buffer_size);
-
-while (s-bytes_xfer  s-xfer_limit  offset  s-buffer_size) {
-
-ret = migrate_fd_put_buffer(s-migration_state, s-buffer + offset,
-s-buffer_size - offset);
-if (ret = 0) {
-DPRINTF(error flushing data, %zd\n, ret);
-break;
-} else {
-DPRINTF(flushed %zd byte(s)\n, ret);
-offset += ret;
-s-bytes_xfer += ret;
-}
-}
-
-DPRINTF(flushed %zu of %zu byte(s)\n, offset, s-buffer_size);
-memmove(s-buffer, s-buffer + offset, s-buffer_size - offset);
-s-buffer_size -= offset;
-
-if (ret  0) {
-return ret;
-}
-return offset;
-}
-
-static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, 
int size)
-{
-QEMUFileBuffered *s = opaque;
-ssize_t error;
-
-DPRINTF(putting %d bytes at % PRId64 \n, size, pos);
-
-error = qemu_file_get_error(s-file);
-if (error) {
-DPRINTF(flush when error, bailing: %s\n, strerror(-error));
-return error;
-}
-
-if (size = 0) {
-return size;
-}
-
-if (size  (s-buffer_capacity - s-buffer_size)) {
-DPRINTF(increasing buffer capacity from %zu by %zu\n,
-s-buffer_capacity, size + 1024);
-
-s-buffer_capacity += size + 1024;
-
-s-buffer = g_realloc(s-buffer, s-buffer_capacity);
-}
-
-memcpy(s-buffer + s-buffer_size, buf, size);
-s-buffer_size += size;
-
-return size;
-}
-
-static int buffered_close(void *opaque)
-{
-QEMUFileBuffered *s = opaque;
-ssize_t ret = 0;
-int ret2;
-
-DPRINTF(closing\n);
-
-s-xfer_limit = INT_MAX;
-while (!qemu_file_get_error(s-file)  s-buffer_size) {
-ret = buffered_flush(s);
-if (ret  0) {
-break;
-}
-}
-
-ret2 = migrate_fd_close(s-migration_state);
-if (ret = 0) {
-ret = ret2;
-}
-ret = migrate_fd_close(s-migration_state);
-s-migration_state-complete = true;
-return ret;
-}
-
-/*
- * The meaning of the return values is:
- *   0: We can continue sending
- *   1: Time to stop
- *   negative: There has been an error
- */
-static int buffered_rate_limit(void *opaque)
-{
-QEMUFileBuffered *s = opaque;
-int ret;
-
-ret = qemu_file_get_error(s-file);
-if (ret) {
-return ret;
-}
-
-if (s-bytes_xfer  s-xfer_limit)
-return 1;
-
-return 0;
-}
-
-static int64_t buffered_set_rate_limit(void *opaque, int64_t new_rate)
-{
-QEMUFileBuffered *s = opaque;
-if (qemu_file_get_error(s-file)) {
-goto out;
-}
-if (new_rate  SIZE_MAX) {
-

[Qemu-devel] [PATCH 28/30] fix memory.c

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 memory.c | 4 +++-
 memory.h | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/memory.c b/memory.c
index 126fb63..4d0fa96 100644
--- a/memory.c
+++ b/memory.c
@@ -1089,7 +1089,9 @@ bool memory_region_test_and_clear_dirty(MemoryRegion *mr,
 ret = cpu_physical_memory_get_dirty(mr-ram_addr + addr, size,
 1  client);
 if (ret) {
-cpu_physical_memory_set_dirty_range(mr-ram_addr + addr, size, -1);
+cpu_physical_memory_reset_dirty(mr-ram_addr + addr,
+mr-ram_addr + addr + size,
+1  client);
 }
 return ret;
 }
diff --git a/memory.h b/memory.h
index 08af012..0dcc0f4 100644
--- a/memory.h
+++ b/memory.h
@@ -449,7 +449,8 @@ void memory_region_set_dirty(MemoryRegion *mr, 
target_phys_addr_t addr,
  */
 bool memory_region_test_and_clear_dirty(MemoryRegion *mr,
 target_phys_addr_t addr,
-target_phys_addr_t size, unsigned 
client)
+target_phys_addr_t size,
+unsigned client);
 /**
  * memory_region_sync_dirty_bitmap: Synchronize a region's dirty bitmap with
  *  any external TLBs (e.g. kvm)
-- 
1.7.11.7




Re: [Qemu-devel] [PATCH 13/15] target-pcc: Convert ppcemb_tlb_t to use fixed 64-bit RPN

2012-10-18 Thread David Gibson
On Thu, Oct 18, 2012 at 08:37:20AM +0200, Alexander Graf wrote:
 
 
 On 18.10.2012, at 07:50, David Gibson da...@gibson.dropbear.id.au wrote:
 
  Currently the ppcemb_tlb_t struct, used on a number of embedded ppc models
  to represent a TLB entry contains a target_phys_addr_t.  That works
  reasonably for now, but is troublesome for saving the state, which we'll
  want to do in future.  target_phys_addr_t is a large enough type to contain
  a physical address for any supported machine - and can thus, in theory at
  least, vary depending on what machines are enabled other than the one
  we're actually using right now.  This makes it unsuitable for describing
  in vmstate.
 
 Target_phys_addr_t is actually 64bit for all ppc targets today since
 some 32 bit boards support more than 32 bit address space ;).

Yes, I know.  In fact since recently it's 64bit always on everything.

 The change still is fine though, as it makes that bit explicit.

Yes.  What this is leading to is the new savevm code - there are no
vmstate helpers for target_phys_addr_t and my attempt to add them met
with at least semi-convincing arguments against.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson



[Qemu-devel] [PATCH 03/30] protect the ramlist with a separate mutex

2012-10-18 Thread Juan Quintela
From: Umesh Deshpande udesh...@redhat.com

Add the new mutex that protects shared state between ram_save_live
and the iothread.  If the iothread mutex has to be taken together
with the ramlist mutex, the iothread shall always be _outside_.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Umesh Deshpande udesh...@redhat.com
Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c |  9 -
 cpu-all.h   |  8 
 exec.c  | 23 +--
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index b47313d..2d29828 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -553,7 +553,6 @@ static void ram_migration_cancel(void *opaque)
 migration_end();
 }

-
 static void reset_ram_globals(void)
 {
 last_block = NULL;
@@ -573,6 +572,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 bitmap_set(migration_bitmap, 1, ram_pages);
 migration_dirty_pages = ram_pages;

+qemu_mutex_lock_ramlist();
 bytes_transferred = 0;
 reset_ram_globals();

@@ -600,6 +600,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 qemu_put_be64(f, block-length);
 }

+qemu_mutex_unlock_ramlist();
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

 return 0;
@@ -614,6 +615,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 uint64_t expected_downtime;
 MigrationState *s = migrate_get_current();

+qemu_mutex_lock_ramlist();
+
 if (ram_list.version != last_version) {
 reset_ram_globals();
 }
@@ -662,6 +665,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 bwidth = 0.01;
 }

+qemu_mutex_unlock_ramlist();
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

 expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
@@ -682,6 +686,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 {
 migration_bitmap_sync();

+qemu_mutex_lock_ramlist();
+
 /* try transferring iterative blocks of memory */

 /* flush all remaining blocks regardless of rate limiting */
@@ -697,6 +703,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 }
 memory_global_dirty_log_stop();

+qemu_mutex_unlock_ramlist();
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

 g_free(migration_bitmap);
diff --git a/cpu-all.h b/cpu-all.h
index e07c91c..2bafcdd 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -22,6 +22,7 @@
 #include qemu-common.h
 #include qemu-tls.h
 #include cpu-common.h
+#include qemu-thread.h

 /* some important defines:
  *
@@ -490,7 +491,9 @@ typedef struct RAMBlock {
 ram_addr_t offset;
 ram_addr_t length;
 uint32_t flags;
+/* Protected by the iothread lock.  */
 QLIST_ENTRY(RAMBlock) next_mru;
+/* Protected by the ramlist lock.  */
 QLIST_ENTRY(RAMBlock) next;
 char idstr[256];
 #if defined(__linux__)  !defined(TARGET_S390X)
@@ -499,9 +502,12 @@ typedef struct RAMBlock {
 } RAMBlock;

 typedef struct RAMList {
+QemuMutex mutex;
+/* Protected by the iothread lock.  */
 uint8_t *phys_dirty;
 uint32_t version;
 QLIST_HEAD(, RAMBlock) blocks_mru;
+/* Protected by the ramlist lock.  */
 QLIST_HEAD(, RAMBlock) blocks;
 } RAMList;
 extern RAMList ram_list;
@@ -521,6 +527,8 @@ extern int mem_prealloc;

 void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 ram_addr_t last_ram_offset(void);
+void qemu_mutex_lock_ramlist(void);
+void qemu_mutex_unlock_ramlist(void);
 #endif /* !CONFIG_USER_ONLY */

 int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
diff --git a/exec.c b/exec.c
index 1e04711..cf9de92 100644
--- a/exec.c
+++ b/exec.c
@@ -637,6 +637,7 @@ bool tcg_enabled(void)
 void cpu_exec_init_all(void)
 {
 #if !defined(CONFIG_USER_ONLY)
+qemu_mutex_init(ram_list.mutex);
 memory_map_init();
 io_mem_init();
 #endif
@@ -2329,6 +2330,16 @@ void qemu_flush_coalesced_mmio_buffer(void)
 kvm_flush_coalesced_mmio_buffer();
 }

+void qemu_mutex_lock_ramlist(void)
+{
+qemu_mutex_lock(ram_list.mutex);
+}
+
+void qemu_mutex_unlock_ramlist(void)
+{
+qemu_mutex_unlock(ram_list.mutex);
+}
+
 #if defined(__linux__)  !defined(TARGET_S390X)

 #include sys/vfs.h
@@ -2510,6 +2521,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char 
*name, DeviceState *dev)
 }
 pstrcat(new_block-idstr, sizeof(new_block-idstr), name);

+qemu_mutex_lock_ramlist();
 QLIST_FOREACH(block, ram_list.blocks, next) {
 if (block != new_block  !strcmp(block-idstr, new_block-idstr)) {
 fprintf(stderr, RAMBlock \%s\ already registered, abort!\n,
@@ -2517,6 +2529,7 @@ void qemu_ram_set_idstr(ram_addr_t addr, const char 
*name, DeviceState *dev)
 abort();
 }
 }
+qemu_mutex_unlock_ramlist();
 }

 static int memory_try_enable_merging(void *addr, size_t len)
@@ -2540,6 +2553,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void 
*host,
 size = TARGET_PAGE_ALIGN(size);
 new_block = 

[Qemu-devel] [PATCH 30/30] ram: optimize migration bitmap walking

2012-10-18 Thread Juan Quintela
Instead of testing each page individually, we search what is the next
dirty page with a bitmap operation.  We have to reorganize the code to
move from a for loop, to a while(dirty) loop.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c | 45 ++---
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 8391375..cd6ebef 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -343,18 +343,21 @@ static unsigned long *migration_bitmap;
 static uint64_t migration_dirty_pages;
 static uint32_t last_version;

-static inline bool migration_bitmap_test_and_reset_dirty(MemoryRegion *mr,
- ram_addr_t offset)
+static inline
+ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
+ ram_addr_t start)
 {
-bool ret;
-int nr = (mr-ram_addr + offset)  TARGET_PAGE_BITS;
+unsigned long base = mr-ram_addr  TARGET_PAGE_BITS;
+unsigned long nr = base + (start  TARGET_PAGE_BITS);
+unsigned long size = base + (int128_get64(mr-size)  TARGET_PAGE_BITS);

-ret = test_and_clear_bit(nr, migration_bitmap);
+unsigned long next = find_next_bit(migration_bitmap, size, nr);

-if (ret) {
+if (next  size) {
+clear_bit(next, migration_bitmap);
 migration_dirty_pages--;
 }
-return ret;
+return (next - base)  TARGET_PAGE_BITS;
 }

 static inline bool migration_bitmap_set_dirty(MemoryRegion *mr,
@@ -423,6 +426,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 {
 RAMBlock *block = last_seen_block;
 ram_addr_t offset = last_offset;
+bool complete_round = false;
 int bytes_sent = -1;
 MemoryRegion *mr;
 ram_addr_t current_addr;
@@ -430,9 +434,21 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 if (!block)
 block = QLIST_FIRST(ram_list.blocks);

-do {
+while(true) {
 mr = block-mr;
-if (migration_bitmap_test_and_reset_dirty(mr, offset)) {
+offset = migration_bitmap_find_and_reset_dirty(mr, offset);
+if (complete_round  block == last_seen_block 
+offset = last_offset) {
+break;
+}
+if (offset = block-length) {
+offset = 0;
+block = QLIST_NEXT(block, next);
+if (!block) {
+block = QLIST_FIRST(ram_list.blocks);
+complete_round = true;
+}
+} else {
 uint8_t *p;
 int cont = (block == last_sent_block) ?
 RAM_SAVE_FLAG_CONTINUE : 0;
@@ -467,16 +483,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 break;
 }
 }
-
-offset += TARGET_PAGE_SIZE;
-if (offset = block-length) {
-offset = 0;
-block = QLIST_NEXT(block, next);
-if (!block)
-block = QLIST_FIRST(ram_list.blocks);
-}
-} while (block != last_seen_block || offset != last_offset);
-
+}
 last_seen_block = block;
 last_offset = offset;

-- 
1.7.11.7




[Qemu-devel] [PATCH 24/30] ram: rename last_block to last_seen_block

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 1eefef8..8ac4c94 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -332,7 +332,10 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t 
*current_data,
 return bytes_sent;
 }

-static RAMBlock *last_block;
+
+/* This is the last block that we have visited serching for dirty pages
+ */
+static RAMBlock *last_seen_block;
 static ram_addr_t last_offset;
 static unsigned long *migration_bitmap;
 static uint64_t migration_dirty_pages;
@@ -417,7 +420,7 @@ static void migration_bitmap_sync(void)

 static int ram_save_block(QEMUFile *f, bool last_stage)
 {
-RAMBlock *block = last_block;
+RAMBlock *block = last_seen_block;
 ram_addr_t offset = last_offset;
 int bytes_sent = -1;
 MemoryRegion *mr;
@@ -430,7 +433,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 mr = block-mr;
 if (migration_bitmap_test_and_reset_dirty(mr, offset)) {
 uint8_t *p;
-int cont = (block == last_block) ? RAM_SAVE_FLAG_CONTINUE : 0;
+int cont = (block == last_seen_block) ?
+RAM_SAVE_FLAG_CONTINUE : 0;

 p = memory_region_get_ram_ptr(mr) + offset;

@@ -469,9 +473,9 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 if (!block)
 block = QLIST_FIRST(ram_list.blocks);
 }
-} while (block != last_block || offset != last_offset);
+} while (block != last_seen_block || offset != last_offset);

-last_block = block;
+last_seen_block = block;
 last_offset = offset;

 return bytes_sent;
@@ -555,7 +559,7 @@ static void ram_migration_cancel(void *opaque)

 static void reset_ram_globals(void)
 {
-last_block = NULL;
+last_seen_block = NULL;
 last_offset = 0;
 last_version = ram_list.version;
 sort_ram_list();
-- 
1.7.11.7




Re: [Qemu-devel] [RFC V2 00/20] QCOW2 deduplication

2012-10-18 Thread Benoît Canet
 Can you use glib GTree instead?

Ok

 
  I am posting this patchset in order to have an early feedback regarding the
  design.
 
 Since it's still not working, it's hard to tell how effective it is in
 practice.

I hope to have something working soon.

Best regards

Benoît



Re: [Qemu-devel] [PATCH 1/2] Ignore RX tail kicks when RX disabled.

2012-10-18 Thread Dmitry Fleytman
Stefan,

The real purpose of check_rxov it a bit confusing indeed, mainly
because of unclear name (rename?),
however it works as following:

There are 2 possible when RDT == RDH for RX ring:
1. Device used all the buffers from ring, no empty buffers available
2. Driver fully refilled the ring and all buffers are empty and ready to use

check_rxov is used to distinguish these 2 cases:
1. It must be 1 initially (init, reset, etc.)
2. It must be set to one when device uses buffer
3. It must be set to 0 when driver adds buffer to the ring
check_rxov == 1 - ring is empty
check_rxov == 0 - ring is full

Indeed, RX init sequence doesn't look logical, however this is the way
all Intel driver behave from e1000 and up to ixgbe.
Also see some explanation here:
http://permalink.gmane.org/gmane.linux.kernel/1375917

If we drop check_rxov and always treat RDH == RDT as empty ring we'll
probably get correct behavior for current Linux driver's code (needs
testing of course),
however we have no idea how Windows drivers work.

Also drivers tend to change...

Dmitry.

On Thu, Oct 18, 2012 at 10:09 AM, Stefan Hajnoczi stefa...@gmail.com wrote:
 On Wed, Oct 17, 2012 at 08:31:46PM +0200, Dmitry Fleytman wrote:
 Device RX initization from driver's side consists of following steps:
   1. Initialize head and tail of RX ring to 0
   2. Enable Rx (set bit in RCTL register)
   3. Allocate buffers, fill descriptors
   4. Write ring tail

 Forth operation signals hardware that RX buffers available
 and it may start packets indication.

 Current implementation treats first operation (write 0 to ring tail)
 as signal of buffers availability and starts data transfers as soon
 as RX enable indicaton arrives.

 This is not correct because there is a chance that ring is still
 empty (third action not performed yet) and then memory corruption
 occures.

 Any idea what the point of hw/e1000.c check_rxov is?  I see nothing in
 the datasheet that requires these semantics.

 The Linux e1000 driver never enables the RXO (rx fifo overflow)
 interrupt, only RXDMT0 (receive descriptor minimum threshold).  This
 means hw/e1000.c will not upset the Linux e1000 driver when
 e1000_receive() gets called with check_rxov == 1 and RDH == RDT == 0.

 BTW the Linux e1000 driver does not follow the sequence recommended in
 the datasheet 14.4 Receive Initialization, which would avoid the weird
 window of time where RDH == RDT == 0.

 If we get rid of check_rxov and always check rxbuf space then we have
 the correct behavior.  I'm a little nervous of simply dropping it
 because its purpose is unclear to me :(.

 Stefan



-- 
Dmitry Fleytman
Technology Expert and Consultant,

Daynix Computing Ltd.

Cell: +972-54-2819481

Skype: dmitry.fleytman



Re: [Qemu-devel] [PATCH 21/30] migration: move exit condition to migration thread

2012-10-18 Thread Paolo Bonzini
Il 18/10/2012 09:30, Juan Quintela ha scritto:
 -if (s-migration_state-complete) {
 +qemu_mutex_lock_iothread();

So, was it a bug that we were accessing -complete without the BQL?

 +if (m-state != MIG_STATE_ACTIVE) {
 +DPRINTF(put_ready returning because of non-active state\n);

The contents of the debug message obsolete.  Besides, I would just put
the two branches in the same if (m-state != MIG_STATE_ACTIVE ||
m-complete).

 +qemu_mutex_unlock_iothread();
  break;
  }
 +if (m-complete) {
 +qemu_mutex_unlock_iothread();
 +break;
 +}
 +qemu_mutex_unlock_iothread();
 +

Paolo



Re: [Qemu-devel] [PATCH 1/2] Ignore RX tail kicks when RX disabled.

2012-10-18 Thread Dmitry Fleytman
Hello, Stefan

The problem occurs between steps 2 and 3.
Let's say packet arrives after step 2 is done by driver.

Head and tail are 0 because of step 1

Check_rxov is 0 because of two reasons:
1. On startup it is 0 by default
2. It is zeroed by setting ring tail to 0 on first step

Then first check ( __if (!(s-mac_reg[RCTL]  E1000_RCTL_EN))__ )
passes because RX enabled on step 2.
e1000_has_rxbufs() returs true because it treats equal head and tail
as fully filled ring when check_rxov is 0:

static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
{
[...]

if (total_size = s-rxbuf_size) {
return s-mac_reg[RDH] != s-mac_reg[RDT] || !s-check_rxov;

[...]

} else if (s-mac_reg[RDH]  s-mac_reg[RDT] || !s-check_rxov) {

   [...]
}

So QEMU reads uninitialized descriptor and tries to perform DMA to
arbitrary address from descriptor.
Depending on address value it corrupts guest memory or abort()'s here:

void *qemu_get_ram_ptr(ram_addr_t addr)
{
[...]

fprintf(stderr, Bad ram offset % PRIx64 \n, (uint64_t)addr);
abort();

[...]
}

Thanks for review,
Dmitry.

On Thu, Oct 18, 2012 at 9:31 AM, Stefan Hajnoczi stefa...@gmail.com wrote:

 On Wed, Oct 17, 2012 at 08:31:46PM +0200, Dmitry Fleytman wrote:
  Device RX initization from driver's side consists of following steps:
1. Initialize head and tail of RX ring to 0
2. Enable Rx (set bit in RCTL register)
3. Allocate buffers, fill descriptors
4. Write ring tail
 
  Forth operation signals hardware that RX buffers available
  and it may start packets indication.
 
  Current implementation treats first operation (write 0 to ring tail)
  as signal of buffers availability and starts data transfers as soon
  as RX enable indicaton arrives.
 
  This is not correct because there is a chance that ring is still
  empty (third action not performed yet) and then memory corruption
  occures.

 The existing code tries to prevent this:

 e1000_receive(NetClientState *nc, const uint8_t *buf, size_t size)
 {
 [...]

 if (!(s-mac_reg[RCTL]  E1000_RCTL_EN))
 return -1;

 [...]
 total_size = size + fcs_len(s);
 if (!e1000_has_rxbufs(s, total_size)) {
 set_ics(s, 0, E1000_ICS_RXO);
 return -1;
 }

 Why are these checks not enough?

 Which memory gets corrupted?

 Stefan




--
Dmitry Fleytman
Technology Expert and Consultant,

Daynix Computing Ltd.

Cell: +972-54-2819481

Skype: dmitry.fleytman



Re: [Qemu-devel] [PATCH 22/30] migration: unfold rest of migrate_fd_put_ready() into thread

2012-10-18 Thread Paolo Bonzini
Il 18/10/2012 09:30, Juan Quintela ha scritto:
 This will allow us finer control in next patches.
 
 Signed-off-by: Juan Quintela quint...@redhat.com
 ---
  migration.c | 95 
 ++---
  1 file changed, 41 insertions(+), 54 deletions(-)
 
 diff --git a/migration.c b/migration.c
 index 7206866..e6ff1f1 100644
 --- a/migration.c
 +++ b/migration.c
 @@ -644,54 +644,6 @@ static int64_t buffered_get_rate_limit(void *opaque)
  return s-xfer_limit;
  }
 
 -static bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
 -{
 -int ret;
 -uint64_t pending_size;
 -bool last_round = false;
 -
 -qemu_mutex_lock_iothread();
 -DPRINTF(iterate\n);
 -pending_size = qemu_savevm_state_pending(s-file, max_size);
 -DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
 -if (pending_size = max_size) {
 -ret = qemu_savevm_state_iterate(s-file);
 -if (ret  0) {
 -migrate_fd_error(s);
 -}
 -} else {
 -int old_vm_running = runstate_is_running();
 -int64_t start_time, end_time;
 -
 -DPRINTF(done iterating\n);
 -start_time = qemu_get_clock_ms(rt_clock);
 -qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
 -if (old_vm_running) {
 -vm_stop(RUN_STATE_FINISH_MIGRATE);
 -} else {
 -vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 -}
 -
 -if (qemu_savevm_state_complete(s-file)  0) {
 -migrate_fd_error(s);
 -} else {
 -migrate_fd_completed(s);
 -}
 -end_time = qemu_get_clock_ms(rt_clock);
 -s-total_time = end_time - s-total_time;
 -s-downtime = end_time - start_time;
 -if (s-state != MIG_STATE_COMPLETED) {
 -if (old_vm_running) {
 -vm_start();
 -}
 -}
 -last_round = true;
 -}
 -qemu_mutex_unlock_iothread();
 -
 -return last_round;
 -}
 -
  /* 100ms  xfer_limit is the limit that we should write each 100ms */
  #define BUFFER_DELAY 100
 
 @@ -716,6 +668,7 @@ static void *buffered_file_thread(void *opaque)
 
  while (true) {
  int64_t current_time = qemu_get_clock_ms(rt_clock);
 +uint64_t pending_size;
 
  qemu_mutex_lock_iothread();
  if (m-state != MIG_STATE_ACTIVE) {
 @@ -727,6 +680,46 @@ static void *buffered_file_thread(void *opaque)
  qemu_mutex_unlock_iothread();
  break;
  }
 +if (s-bytes_xfer  s-xfer_limit) {
 +DPRINTF(iterate\n);
 +pending_size = qemu_savevm_state_pending(m-file, max_size);
 +DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
 +if (pending_size = max_size) {
 +ret = qemu_savevm_state_iterate(m-file);

So RAM migration is still being run inside the BQL, isn't it?

 +if (ret  0) {
 +qemu_mutex_unlock_iothread();
 +break;

There's a lot of

qemu_mutex_unlock_iothread();
break;

in this function.  Perhaps it is better if you make an invariant that
the loop is entered and exited with the BQL taken, and it is only
unlocked in the middle.  It makes sense once you fold everything in
migration.c.

It can be a separate patch though.

 +}
 +} else {
 +int old_vm_running = runstate_is_running();
 +int64_t start_time, end_time;
 +
 +DPRINTF(done iterating\n);
 +start_time = qemu_get_clock_ms(rt_clock);
 +qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
 +if (old_vm_running) {
 +vm_stop(RUN_STATE_FINISH_MIGRATE);
 +} else {
 +vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 +}
 +ret = qemu_savevm_state_complete(m-file);
 +if (ret  0) {
 +qemu_mutex_unlock_iothread();
 +break;
 +} else {
 +migrate_fd_completed(m);
 +}
 +end_time = qemu_get_clock_ms(rt_clock);
 +m-total_time = end_time - m-total_time;
 +m-downtime = end_time - start_time;
 +if (m-state != MIG_STATE_COMPLETED) {
 +if (old_vm_running) {
 +vm_start();
 +}
 +}
 +last_round = true;
 +}
 +}
  qemu_mutex_unlock_iothread();
 
  if (current_time = initial_time + BUFFER_DELAY) {
 @@ -747,12 +740,6 @@ static void *buffered_file_thread(void *opaque)
  usleep((initial_time + BUFFER_DELAY - current_time)*1000);
  }
  buffered_flush(s);
 -
 -DPRINTF(file is ready\n);
 -if (s-bytes_xfer  s-xfer_limit) {
 -   

[Qemu-devel] [PATCH 21/30] migration: move exit condition to migration thread

2012-10-18 Thread Juan Quintela
Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/migration.c b/migration.c
index 8cacbc3..7206866 100644
--- a/migration.c
+++ b/migration.c
@@ -651,12 +651,6 @@ static bool migrate_fd_put_ready(MigrationState *s, 
uint64_t max_size)
 bool last_round = false;

 qemu_mutex_lock_iothread();
-if (s-state != MIG_STATE_ACTIVE) {
-DPRINTF(put_ready returning because of non-active state\n);
-qemu_mutex_unlock_iothread();
-return false;
-}
-
 DPRINTF(iterate\n);
 pending_size = qemu_savevm_state_pending(s-file, max_size);
 DPRINTF(pending size %lu max %lu\n, pending_size, max_size);
@@ -723,9 +717,18 @@ static void *buffered_file_thread(void *opaque)
 while (true) {
 int64_t current_time = qemu_get_clock_ms(rt_clock);

-if (s-migration_state-complete) {
+qemu_mutex_lock_iothread();
+if (m-state != MIG_STATE_ACTIVE) {
+DPRINTF(put_ready returning because of non-active state\n);
+qemu_mutex_unlock_iothread();
 break;
 }
+if (m-complete) {
+qemu_mutex_unlock_iothread();
+break;
+}
+qemu_mutex_unlock_iothread();
+
 if (current_time = initial_time + BUFFER_DELAY) {
 uint64_t transferred_bytes = s-bytes_xfer;
 uint64_t time_spent = current_time - initial_time;
-- 
1.7.11.7




Re: [Qemu-devel] [PATCH 1/2] Ignore RX tail kicks when RX disabled.

2012-10-18 Thread Stefan Hajnoczi
On Wed, Oct 17, 2012 at 08:31:46PM +0200, Dmitry Fleytman wrote:
 Device RX initization from driver's side consists of following steps:
   1. Initialize head and tail of RX ring to 0
   2. Enable Rx (set bit in RCTL register)
   3. Allocate buffers, fill descriptors
   4. Write ring tail
 
 Forth operation signals hardware that RX buffers available
 and it may start packets indication.
 
 Current implementation treats first operation (write 0 to ring tail)
 as signal of buffers availability and starts data transfers as soon
 as RX enable indicaton arrives.
 
 This is not correct because there is a chance that ring is still
 empty (third action not performed yet) and then memory corruption
 occures.

The existing code tries to prevent this:

e1000_receive(NetClientState *nc, const uint8_t *buf, size_t size)
{
[...]

if (!(s-mac_reg[RCTL]  E1000_RCTL_EN))
return -1;

[...]
total_size = size + fcs_len(s);
if (!e1000_has_rxbufs(s, total_size)) {
set_ics(s, 0, E1000_ICS_RXO);
return -1;
}

Why are these checks not enough?

Which memory gets corrupted?

Stefan



[Qemu-devel] [PATCH 26/30] memory: introduce memory_region_test_and_clear_dirty

2012-10-18 Thread Juan Quintela
This function avoids having to do two calls, one to test the dirty bit, and
other to reset it.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 memory.c | 15 +++
 memory.h | 17 +
 2 files changed, 32 insertions(+)

diff --git a/memory.c b/memory.c
index 4f3ade0..126fb63 100644
--- a/memory.c
+++ b/memory.c
@@ -1080,6 +1080,21 @@ void memory_region_set_dirty(MemoryRegion *mr, 
target_phys_addr_t addr,
 return cpu_physical_memory_set_dirty_range(mr-ram_addr + addr, size, -1);
 }

+bool memory_region_test_and_clear_dirty(MemoryRegion *mr,
+target_phys_addr_t addr,
+target_phys_addr_t size, unsigned 
client)
+{
+bool ret;
+assert(mr-terminates);
+ret = cpu_physical_memory_get_dirty(mr-ram_addr + addr, size,
+1  client);
+if (ret) {
+cpu_physical_memory_set_dirty_range(mr-ram_addr + addr, size, -1);
+}
+return ret;
+}
+
+
 void memory_region_sync_dirty_bitmap(MemoryRegion *mr)
 {
 FlatRange *fr;
diff --git a/memory.h b/memory.h
index 37ce151..08af012 100644
--- a/memory.h
+++ b/memory.h
@@ -434,6 +434,23 @@ void memory_region_set_dirty(MemoryRegion *mr, 
target_phys_addr_t addr,
  target_phys_addr_t size);

 /**
+ * memory_region_test_and_clear_dirty: Check whether a range of bytes is dirty
+ * for a specified client. It clears them.
+ *
+ * Checks whether a range of bytes has been written to since the last
+ * call to memory_region_reset_dirty() with the same @client.  Dirty logging
+ * must be enabled.
+ *
+ * @mr: the memory region being queried.
+ * @addr: the address (relative to the start of the region) being queried.
+ * @size: the size of the range being queried.
+ * @client: the user of the logging information; %DIRTY_MEMORY_MIGRATION or
+ *  %DIRTY_MEMORY_VGA.
+ */
+bool memory_region_test_and_clear_dirty(MemoryRegion *mr,
+target_phys_addr_t addr,
+target_phys_addr_t size, unsigned 
client)
+/**
  * memory_region_sync_dirty_bitmap: Synchronize a region's dirty bitmap with
  *  any external TLBs (e.g. kvm)
  *
-- 
1.7.11.7




Re: [Qemu-devel] [PATCH 28/30] fix memory.c

2012-10-18 Thread Paolo Bonzini
Il 18/10/2012 09:30, Juan Quintela ha scritto:
 Signed-off-by: Juan Quintela quint...@redhat.com
 ---
  memory.c | 4 +++-
  memory.h | 3 ++-
  2 files changed, 5 insertions(+), 2 deletions(-)
 
 diff --git a/memory.c b/memory.c
 index 126fb63..4d0fa96 100644
 --- a/memory.c
 +++ b/memory.c
 @@ -1089,7 +1089,9 @@ bool memory_region_test_and_clear_dirty(MemoryRegion 
 *mr,
  ret = cpu_physical_memory_get_dirty(mr-ram_addr + addr, size,
  1  client);
  if (ret) {
 -cpu_physical_memory_set_dirty_range(mr-ram_addr + addr, size, -1);
 +cpu_physical_memory_reset_dirty(mr-ram_addr + addr,
 +mr-ram_addr + addr + size,
 +1  client);
  }
  return ret;
  }
 diff --git a/memory.h b/memory.h
 index 08af012..0dcc0f4 100644
 --- a/memory.h
 +++ b/memory.h
 @@ -449,7 +449,8 @@ void memory_region_set_dirty(MemoryRegion *mr, 
 target_phys_addr_t addr,
   */
  bool memory_region_test_and_clear_dirty(MemoryRegion *mr,
  target_phys_addr_t addr,
 -target_phys_addr_t size, unsigned 
 client)
 +target_phys_addr_t size,
 +unsigned client);
  /**
   * memory_region_sync_dirty_bitmap: Synchronize a region's dirty bitmap with
   *  any external TLBs (e.g. kvm)
 

This should be squashed in patch 26.

Paolo



[Qemu-devel] [PATCH 11/30] buffered_file: don't flush on put buffer

2012-10-18 Thread Juan Quintela
We call buffered_put_buffer with iothread held, and buffered_flush() does
synchronous writes.  We only want to do the synchronous writes outside.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 buffered_file.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index 7692950..f508026 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -107,12 +107,6 @@ static int buffered_put_buffer(void *opaque, const uint8_t 
*buf, int64_t pos, in
 buffered_append(s, buf, size);
 }

-error = buffered_flush(s);
-if (error  0) {
-DPRINTF(buffered flush error. bailing: %s\n, strerror(-error));
-return error;
-}
-
 return size;
 }

-- 
1.7.11.7




[Qemu-devel] [PATCH 08/30] migration: remove unfreeze logic

2012-10-18 Thread Juan Quintela
Now that we have a thread, and blocking writes, we don't need it.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 buffered_file.c | 24 +---
 migration.c | 23 ---
 migration.h |  1 -
 3 files changed, 1 insertion(+), 47 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index 6395b37..876cc8c 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -26,7 +26,6 @@ typedef struct QEMUFileBuffered
 {
 MigrationState *migration_state;
 QEMUFile *file;
-int freeze_output;
 size_t bytes_xfer;
 size_t xfer_limit;
 uint8_t *buffer;
@@ -70,13 +69,6 @@ static ssize_t buffered_flush(QEMUFileBuffered *s)

 ret = migrate_fd_put_buffer(s-migration_state, s-buffer + offset,
 s-buffer_size - offset);
-if (ret == -EAGAIN) {
-DPRINTF(backend not ready, freezing\n);
-ret = 0;
-s-freeze_output = 1;
-break;
-}
-
 if (ret = 0) {
 DPRINTF(error flushing data, %zd\n, ret);
 break;
@@ -110,9 +102,6 @@ static int buffered_put_buffer(void *opaque, const uint8_t 
*buf, int64_t pos, in
 return error;
 }

-DPRINTF(unfreezing output\n);
-s-freeze_output = 0;
-
 if (size  0) {
 DPRINTF(buffering %d bytes\n, size - offset);
 buffered_append(s, buf, size);
@@ -126,7 +115,7 @@ static int buffered_put_buffer(void *opaque, const uint8_t 
*buf, int64_t pos, in

 if (pos == 0  size == 0) {
 DPRINTF(file is ready\n);
-if (!s-freeze_output  s-bytes_xfer  s-xfer_limit) {
+if (s-bytes_xfer  s-xfer_limit) {
 DPRINTF(notifying client\n);
 migrate_fd_put_ready(s-migration_state);
 }
@@ -149,12 +138,6 @@ static int buffered_close(void *opaque)
 if (ret  0) {
 break;
 }
-if (s-freeze_output) {
-ret = migrate_fd_wait_for_unfreeze(s-migration_state);
-if (ret  0) {
-break;
-}
-}
 }

 ret2 = migrate_fd_close(s-migration_state);
@@ -181,8 +164,6 @@ static int buffered_rate_limit(void *opaque)
 if (ret) {
 return ret;
 }
-if (s-freeze_output)
-return 1;

 if (s-bytes_xfer  s-xfer_limit)
 return 1;
@@ -227,9 +208,6 @@ static void *buffered_file_thread(void *opaque)
 if (s-migration_state-complete) {
 break;
 }
-if (s-freeze_output) {
-continue;
-}
 if (current_time = expire_time) {
 s-bytes_xfer = 0;
 expire_time = current_time + BUFFER_DELAY;
diff --git a/migration.c b/migration.c
index a8b2f4a..29ee710 100644
--- a/migration.c
+++ b/migration.c
@@ -367,29 +367,6 @@ static void migrate_fd_cancel(MigrationState *s)
 migrate_fd_cleanup(s);
 }

-int migrate_fd_wait_for_unfreeze(MigrationState *s)
-{
-int ret;
-
-DPRINTF(wait for unfreeze\n);
-if (s-state != MIG_STATE_ACTIVE)
-return -EINVAL;
-
-do {
-fd_set wfds;
-
-FD_ZERO(wfds);
-FD_SET(s-fd, wfds);
-
-ret = select(s-fd + 1, NULL, wfds, NULL, NULL);
-} while (ret == -1  (s-get_error(s)) == EINTR);
-
-if (ret == -1) {
-return -s-get_error(s);
-}
-return 0;
-}
-
 int migrate_fd_close(MigrationState *s)
 {
 return s-close(s);
diff --git a/migration.h b/migration.h
index a63c5d5..505f792 100644
--- a/migration.h
+++ b/migration.h
@@ -82,7 +82,6 @@ void migrate_fd_connect(MigrationState *s);
 ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
   size_t size);
 void migrate_fd_put_ready(MigrationState *s);
-int migrate_fd_wait_for_unfreeze(MigrationState *s);
 int migrate_fd_close(MigrationState *s);

 void add_migration_state_change_notifier(Notifier *notify);
-- 
1.7.11.7




[Qemu-devel] [PATCH 04/30] buffered_file: Move from using a timer to use a thread

2012-10-18 Thread Juan Quintela
We still protect everything except the wait with the iothread lock.
But we moved from a timer to a thread.  Steps one by one.

We also need to detect when we have finished with a variable complete.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 buffered_file.c | 58 +++--
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index ed92df1..4b90d54 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -18,6 +18,7 @@
 #include qemu-timer.h
 #include qemu-char.h
 #include buffered_file.h
+#include qemu-thread.h

 //#define DEBUG_BUFFERED_FILE

@@ -31,7 +32,8 @@ typedef struct QEMUFileBuffered
 uint8_t *buffer;
 size_t buffer_size;
 size_t buffer_capacity;
-QEMUTimer *timer;
+QemuThread thread;
+bool complete;
 } QEMUFileBuffered;

 #ifdef DEBUG_BUFFERED_FILE
@@ -160,11 +162,8 @@ static int buffered_close(void *opaque)
 if (ret = 0) {
 ret = ret2;
 }
-qemu_del_timer(s-timer);
-qemu_free_timer(s-timer);
-g_free(s-buffer);
-g_free(s);
-
+ret = migrate_fd_close(s-migration_state);
+s-complete = true;
 return ret;
 }

@@ -215,23 +214,38 @@ static int64_t buffered_get_rate_limit(void *opaque)
 return s-xfer_limit;
 }

-static void buffered_rate_tick(void *opaque)
+/* 10ms  xfer_limit is the limit that we should write each 10ms */
+#define BUFFER_DELAY 100
+
+static void *buffered_file_thread(void *opaque)
 {
 QEMUFileBuffered *s = opaque;
+int64_t expire_time = qemu_get_clock_ms(rt_clock) + BUFFER_DELAY;

-if (qemu_file_get_error(s-file)) {
-buffered_close(s);
-return;
-}
-
-qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 100);
-
-if (s-freeze_output)
-return;
-
-s-bytes_xfer = 0;
+while (true) {
+int64_t current_time = qemu_get_clock_ms(rt_clock);

-buffered_put_buffer(s, NULL, 0, 0);
+if (s-complete) {
+break;
+}
+if (s-freeze_output) {
+continue;
+}
+if (current_time = expire_time) {
+s-bytes_xfer = 0;
+expire_time = current_time + BUFFER_DELAY;
+}
+if (s-bytes_xfer = s-xfer_limit) {
+/* usleep expects microseconds */
+usleep((expire_time - current_time)*1000);
+}
+qemu_mutex_lock_iothread();
+buffered_put_buffer(s, NULL, 0, 0);
+qemu_mutex_unlock_iothread();
+}
+g_free(s-buffer);
+g_free(s);
+return NULL;
 }

 QEMUFile *qemu_fopen_ops_buffered(MigrationState *migration_state)
@@ -242,15 +256,15 @@ QEMUFile *qemu_fopen_ops_buffered(MigrationState 
*migration_state)

 s-migration_state = migration_state;
 s-xfer_limit = migration_state-bandwidth_limit / 10;
+s-complete = false;

 s-file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
  buffered_close, buffered_rate_limit,
  buffered_set_rate_limit,
 buffered_get_rate_limit);

-s-timer = qemu_new_timer_ms(rt_clock, buffered_rate_tick, s);
-
-qemu_mod_timer(s-timer, qemu_get_clock_ms(rt_clock) + 100);
+qemu_thread_create(s-thread, buffered_file_thread, s,
+   QEMU_THREAD_DETACHED);

 return s-file;
 }
-- 
1.7.11.7




[Qemu-devel] [PATCH 05/30] migration: make qemu_fopen_ops_buffered() return void

2012-10-18 Thread Juan Quintela
We want the file assignment to happen before the thread is created to
avoid locking, so we just do it before creating the thread.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 buffered_file.c | 13 ++---
 buffered_file.h |  2 +-
 migration.c |  2 +-
 migration.h |  1 +
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index 4b90d54..6395b37 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -33,7 +33,6 @@ typedef struct QEMUFileBuffered
 size_t buffer_size;
 size_t buffer_capacity;
 QemuThread thread;
-bool complete;
 } QEMUFileBuffered;

 #ifdef DEBUG_BUFFERED_FILE
@@ -163,7 +162,7 @@ static int buffered_close(void *opaque)
 ret = ret2;
 }
 ret = migrate_fd_close(s-migration_state);
-s-complete = true;
+s-migration_state-complete = true;
 return ret;
 }

@@ -225,7 +224,7 @@ static void *buffered_file_thread(void *opaque)
 while (true) {
 int64_t current_time = qemu_get_clock_ms(rt_clock);

-if (s-complete) {
+if (s-migration_state-complete) {
 break;
 }
 if (s-freeze_output) {
@@ -248,7 +247,7 @@ static void *buffered_file_thread(void *opaque)
 return NULL;
 }

-QEMUFile *qemu_fopen_ops_buffered(MigrationState *migration_state)
+void qemu_fopen_ops_buffered(MigrationState *migration_state)
 {
 QEMUFileBuffered *s;

@@ -256,15 +255,15 @@ QEMUFile *qemu_fopen_ops_buffered(MigrationState 
*migration_state)

 s-migration_state = migration_state;
 s-xfer_limit = migration_state-bandwidth_limit / 10;
-s-complete = false;
+s-migration_state-complete = false;

 s-file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
  buffered_close, buffered_rate_limit,
  buffered_set_rate_limit,
 buffered_get_rate_limit);

+migration_state-file = s-file;
+
 qemu_thread_create(s-thread, buffered_file_thread, s,
QEMU_THREAD_DETACHED);
-
-return s-file;
 }
diff --git a/buffered_file.h b/buffered_file.h
index ef010fe..8a246fd 100644
--- a/buffered_file.h
+++ b/buffered_file.h
@@ -17,6 +17,6 @@
 #include hw/hw.h
 #include migration.h

-QEMUFile *qemu_fopen_ops_buffered(MigrationState *migration_state);
+void qemu_fopen_ops_buffered(MigrationState *migration_state);

 #endif
diff --git a/migration.c b/migration.c
index 62e0304..02f4ffa 100644
--- a/migration.c
+++ b/migration.c
@@ -431,7 +431,7 @@ void migrate_fd_connect(MigrationState *s)
 int ret;

 s-state = MIG_STATE_ACTIVE;
-s-file = qemu_fopen_ops_buffered(s);
+qemu_fopen_ops_buffered(s);

 DPRINTF(beginning savevm\n);
 ret = qemu_savevm_state_begin(s-file, s-params);
diff --git a/migration.h b/migration.h
index 1c3e9b7..a63c5d5 100644
--- a/migration.h
+++ b/migration.h
@@ -45,6 +45,7 @@ struct MigrationState
 int64_t dirty_pages_rate;
 bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
 int64_t xbzrle_cache_size;
+bool complete;
 };

 void process_incoming_migration(QEMUFile *f);
-- 
1.7.11.7




[Qemu-devel] [PATCH 06/30] migration: stop all cpus correctly

2012-10-18 Thread Juan Quintela
You can only stop all cpus from the iothread or an vcpu.  As we want
to do it from the migration_thread, we need to do this dance with the
botton handlers.

This patch is a request for ideas.  I can move this function to cpus.c, but
wondered if there is an easy way of doing this?

Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration.c | 29 +
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/migration.c b/migration.c
index 02f4ffa..05ef1a3 100644
--- a/migration.c
+++ b/migration.c
@@ -20,6 +20,7 @@
 #include sysemu.h
 #include block.h
 #include qemu_socket.h
+#include qemu-thread.h
 #include block-migration.h
 #include qmp-commands.h

@@ -322,11 +323,22 @@ ssize_t migrate_fd_put_buffer(MigrationState *s, const 
void *data,
 void migrate_fd_put_ready(MigrationState *s)
 {
 int ret;
+static bool first_time = true;

 if (s-state != MIG_STATE_ACTIVE) {
 DPRINTF(put_ready returning because of non-active state\n);
 return;
 }
+if (first_time) {
+first_time = false;
+DPRINTF(beginning savevm\n);
+ret = qemu_savevm_state_begin(s-file, s-params);
+if (ret  0) {
+DPRINTF(failed, %d\n, ret);
+migrate_fd_error(s);
+return;
+}
+}

 DPRINTF(iterate\n);
 ret = qemu_savevm_state_iterate(s-file);
@@ -339,7 +351,11 @@ void migrate_fd_put_ready(MigrationState *s)
 DPRINTF(done iterating\n);
 start_time = qemu_get_clock_ms(rt_clock);
 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+if (old_vm_running) {
+vm_stop(RUN_STATE_FINISH_MIGRATE);
+} else {
+vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+}

 if (qemu_savevm_state_complete(s-file)  0) {
 migrate_fd_error(s);
@@ -428,19 +444,8 @@ bool migration_has_failed(MigrationState *s)

 void migrate_fd_connect(MigrationState *s)
 {
-int ret;
-
 s-state = MIG_STATE_ACTIVE;
 qemu_fopen_ops_buffered(s);
-
-DPRINTF(beginning savevm\n);
-ret = qemu_savevm_state_begin(s-file, s-params);
-if (ret  0) {
-DPRINTF(failed, %d\n, ret);
-migrate_fd_error(s);
-return;
-}
-migrate_fd_put_ready(s);
 }

 static MigrationState *migrate_init(const MigrationParams *params)
-- 
1.7.11.7




[Qemu-devel] [PATCH 07/30] migration: make writes blocking

2012-10-18 Thread Juan Quintela
Move all the writes to the migration_thread, and make writings
blocking.  Notice that are still using the iothread for everything
that we do.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration-exec.c |  2 --
 migration-fd.c   |  6 --
 migration-tcp.c  | 19 ++-
 migration-unix.c |  2 --
 migration.c  | 21 -
 qemu-file.h  |  5 -
 savevm.c |  5 -
 7 files changed, 2 insertions(+), 58 deletions(-)

diff --git a/migration-exec.c b/migration-exec.c
index 6c97db9..908f22e 100644
--- a/migration-exec.c
+++ b/migration-exec.c
@@ -76,8 +76,6 @@ int exec_start_outgoing_migration(MigrationState *s, const 
char *command)
 goto err_after_open;
 }

-socket_set_nonblock(s-fd);
-
 s-opaque = qemu_popen(f, w);

 s-close = exec_close;
diff --git a/migration-fd.c b/migration-fd.c
index 7335167..aba1be7 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -81,11 +81,6 @@ int fd_start_outgoing_migration(MigrationState *s, const 
char *fdname)
 goto err_after_get_fd;
 }

-if (fcntl(s-fd, F_SETFL, O_NONBLOCK) == -1) {
-DPRINTF(Unable to set nonblocking mode on file descriptor\n);
-goto err_after_open;
-}
-
 s-get_error = fd_errno;
 s-write = fd_write;
 s-close = fd_close;
@@ -93,7 +88,6 @@ int fd_start_outgoing_migration(MigrationState *s, const char 
*fdname)
 migrate_fd_connect(s);
 return 0;

-err_after_open:
 close(s-fd);
 err_after_get_fd:
 return -1;
diff --git a/migration-tcp.c b/migration-tcp.c
index a15c2b8..812ae22 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -53,21 +53,6 @@ static int tcp_close(MigrationState *s)
 return r;
 }

-static void tcp_wait_for_connect(int fd, void *opaque)
-{
-MigrationState *s = opaque;
-
-if (fd  0) {
-DPRINTF(migrate connect error\n);
-s-fd = -1;
-migrate_fd_error(s);
-} else {
-DPRINTF(migrate connect success\n);
-s-fd = fd;
-migrate_fd_connect(s);
-}
-}
-
 int tcp_start_outgoing_migration(MigrationState *s, const char *host_port,
  Error **errp)
 {
@@ -75,12 +60,12 @@ int tcp_start_outgoing_migration(MigrationState *s, const 
char *host_port,
 s-write = socket_write;
 s-close = tcp_close;

-s-fd = inet_nonblocking_connect(host_port, tcp_wait_for_connect, s,
- errp);
+s-fd = inet_connect(host_port, errp);
 if (error_is_set(errp)) {
 migrate_fd_error(s);
 return -1;
 }
+migrate_fd_connect(s);

 return 0;
 }
diff --git a/migration-unix.c b/migration-unix.c
index 169de88..bb41bac 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -96,8 +96,6 @@ int unix_start_outgoing_migration(MigrationState *s, const 
char *path)
 return -errno;
 }

-socket_set_nonblock(s-fd);
-
 do {
 ret = connect(s-fd, (struct sockaddr *)addr, sizeof(addr));
 if (ret == -1) {
diff --git a/migration.c b/migration.c
index 05ef1a3..a8b2f4a 100644
--- a/migration.c
+++ b/migration.c
@@ -247,10 +247,6 @@ static int migrate_fd_cleanup(MigrationState *s)
 {
 int ret = 0;

-if (s-fd != -1) {
-qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL);
-}
-
 if (s-file) {
 DPRINTF(closing file\n);
 ret = qemu_fclose(s-file);
@@ -285,18 +281,6 @@ static void migrate_fd_completed(MigrationState *s)
 notifier_list_notify(migration_state_notifiers, s);
 }

-static void migrate_fd_put_notify(void *opaque)
-{
-MigrationState *s = opaque;
-int ret;
-
-qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL);
-ret = qemu_file_put_notify(s-file);
-if (ret) {
-migrate_fd_error(s);
-}
-}
-
 ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
   size_t size)
 {
@@ -313,10 +297,6 @@ ssize_t migrate_fd_put_buffer(MigrationState *s, const 
void *data,
 if (ret == -1)
 ret = -(s-get_error(s));

-if (ret == -EAGAIN) {
-qemu_set_fd_handler2(s-fd, NULL, NULL, migrate_fd_put_notify, s);
-}
-
 return ret;
 }

@@ -412,7 +392,6 @@ int migrate_fd_wait_for_unfreeze(MigrationState *s)

 int migrate_fd_close(MigrationState *s)
 {
-qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL);
 return s-close(s);
 }

diff --git a/qemu-file.h b/qemu-file.h
index 9c8985b..e88892c 100644
--- a/qemu-file.h
+++ b/qemu-file.h
@@ -104,11 +104,6 @@ int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t 
new_rate);
 int64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error(QEMUFile *f);

-/* Try to send any outstanding data.  This function is useful when output is
- * halted due to rate limiting or EAGAIN errors occur as it can be used to
- * resume output. */
-int qemu_file_put_notify(QEMUFile *f);
-
 static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
 {
 qemu_put_be64(f, *pv);
diff --git a/savevm.c 

Re: [Qemu-devel] [PATCH 22/30] migration: unfold rest of migrate_fd_put_ready() into thread

2012-10-18 Thread Paolo Bonzini
Il 18/10/2012 10:39, Paolo Bonzini ha scritto:
 in this function.  Perhaps it is better if you make an invariant that
 the loop is entered and exited with the BQL taken, and it is only
 unlocked in the middle.  It makes sense once you fold everything in
 migration.c.

i.e. this:

diff --git a/migration.c b/migration.c
index 554d79a..bd9fe2d 100644
--- a/migration.c
+++ b/migration.c
@@ -661,23 +661,18 @@ static void *buffered_file_thread(void *opaque)
 ret = qemu_savevm_state_begin(m-file, m-params);
 if (ret  0) {
 DPRINTF(failed, %d\n, ret);
-qemu_mutex_unlock_iothread();
 goto out;
 }
-qemu_mutex_unlock_iothread();
 
 while (true) {
 int64_t current_time = qemu_get_clock_ms(rt_clock);
 uint64_t pending_size;
 
-qemu_mutex_lock_iothread();
 if (m-state != MIG_STATE_ACTIVE) {
 DPRINTF(put_ready returning because of non-active state\n);
-qemu_mutex_unlock_iothread();
 break;
 }
 if (m-complete) {
-qemu_mutex_unlock_iothread();
 break;
 }
 if (s-bytes_xfer  s-xfer_limit) {
@@ -687,7 +682,6 @@ static void *buffered_file_thread(void *opaque)
 if (pending_size  pending_size = max_size) {
 ret = qemu_savevm_state_iterate(m-file);
 if (ret  0) {
-qemu_mutex_unlock_iothread();
 break;
 }
 } else {
@@ -708,7 +702,6 @@ static void *buffered_file_thread(void *opaque)
 printf(vm_stop 2 %ld\n, end_time - start_time);
 ret = qemu_savevm_state_complete(m-file);
 if (ret  0) {
-qemu_mutex_unlock_iothread();
 break;
 } else {
 end_time = qemu_get_clock_ms(rt_clock);
@@ -729,8 +722,8 @@ static void *buffered_file_thread(void *opaque)
 last_round = true;
 }
 }
-qemu_mutex_unlock_iothread();
 
+qemu_mutex_unlock_iothread();
 if (current_time = initial_time + BUFFER_DELAY) {
 uint64_t transferred_bytes = s-bytes_xfer;
 uint64_t time_spent = current_time - initial_time;
@@ -749,9 +742,11 @@ static void *buffered_file_thread(void *opaque)
 usleep((initial_time + BUFFER_DELAY - current_time)*1000);
 }
 buffered_flush(s);
+qemu_mutex_lock_iothread();
 }
 
 out:
+qemu_mutex_unlock_iothread();
 if (ret  0) {
 migrate_fd_error(m);
 }


BTW the completion (basically the else starting at int old_vm_running =
runstate_is_running() looks much nicer in a separate function, like:

if (pending_size  pending_size = max_size) {
ret = qemu_savevm_state_iterate(m-file);
} else {
ret = migration_complete(m);
last_round = true;
}
if (ret  0) {
break;
}

Paolo



Re: [Qemu-devel] [PATCH 04/30] buffered_file: Move from using a timer to use a thread

2012-10-18 Thread Paolo Bonzini
Il 18/10/2012 09:30, Juan Quintela ha scritto:
 +}
 +if (s-bytes_xfer = s-xfer_limit) {
 +/* usleep expects microseconds */
 +usleep((expire_time - current_time)*1000);
 +}

g_usleep please.

Paolo



[Qemu-devel] [PATCH 12/30] buffered_file: unfold buffered_append in buffered_put_buffer

2012-10-18 Thread Juan Quintela
It was the only user, and now buffered_put_buffer just do the append

Signed-off-by: Juan Quintela quint...@redhat.com
---
 buffered_file.c | 33 ++---
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index f508026..fcb3907 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -42,22 +42,6 @@ typedef struct QEMUFileBuffered
 do { } while (0)
 #endif

-static void buffered_append(QEMUFileBuffered *s,
-const uint8_t *buf, size_t size)
-{
-if (size  (s-buffer_capacity - s-buffer_size)) {
-DPRINTF(increasing buffer capacity from %zu by %zu\n,
-s-buffer_capacity, size + 1024);
-
-s-buffer_capacity += size + 1024;
-
-s-buffer = g_realloc(s-buffer, s-buffer_capacity);
-}
-
-memcpy(s-buffer + s-buffer_size, buf, size);
-s-buffer_size += size;
-}
-
 static ssize_t buffered_flush(QEMUFileBuffered *s)
 {
 size_t offset = 0;
@@ -102,11 +86,22 @@ static int buffered_put_buffer(void *opaque, const uint8_t 
*buf, int64_t pos, in
 return error;
 }

-if (size  0) {
-DPRINTF(buffering %d bytes\n, size - offset);
-buffered_append(s, buf, size);
+if (size = 0) {
+return size;
 }

+if (size  (s-buffer_capacity - s-buffer_size)) {
+DPRINTF(increasing buffer capacity from %zu by %zu\n,
+s-buffer_capacity, size + 1024);
+
+s-buffer_capacity += size + 1024;
+
+s-buffer = g_realloc(s-buffer, s-buffer_capacity);
+}
+
+memcpy(s-buffer + s-buffer_size, buf, size);
+s-buffer_size += size;
+
 return size;
 }

-- 
1.7.11.7




Re: [Qemu-devel] [PATCH 16/30] migration: move buffered_file.c code into migration.c

2012-10-18 Thread Paolo Bonzini
Il 18/10/2012 09:30, Juan Quintela ha scritto:
 This only moves the code (also from buffered_file.h to migration.h).
 Fix whitespace until checkpatch is happy.

While I agree with this patch, it is also a conflict magnet.  My
migration-in-a-coroutine cleanups will touch buffered_file.c, you're
warned...

Paolo

 Signed-off-by: Juan Quintela quint...@redhat.com
 ---
  Makefile.objs   |   2 +-
  buffered_file.c | 244 
 
  buffered_file.h |  22 -
  migration.c | 218 +-
  migration.h |   1 +
  5 files changed, 219 insertions(+), 268 deletions(-)
  delete mode 100644 buffered_file.c
  delete mode 100644 buffered_file.h
 
 diff --git a/Makefile.objs b/Makefile.objs
 index 74b3542..3de8f27 100644
 --- a/Makefile.objs
 +++ b/Makefile.objs
 @@ -73,7 +73,7 @@ extra-obj-$(CONFIG_LINUX) += fsdev/
 
  common-obj-y += tcg-runtime.o host-utils.o main-loop.o
  common-obj-y += input.o
 -common-obj-y += buffered_file.o migration.o migration-tcp.o
 +common-obj-y += migration.o migration-tcp.o
  common-obj-y += qemu-char.o #aio.o
  common-obj-y += block-migration.o iohandler.o
  common-obj-y += pflib.o
 diff --git a/buffered_file.c b/buffered_file.c
 deleted file mode 100644
 index c21f847..000
 --- a/buffered_file.c
 +++ /dev/null
 @@ -1,244 +0,0 @@
 -/*
 - * QEMU buffered QEMUFile
 - *
 - * Copyright IBM, Corp. 2008
 - *
 - * Authors:
 - *  Anthony Liguori   aligu...@us.ibm.com
 - *
 - * This work is licensed under the terms of the GNU GPL, version 2.  See
 - * the COPYING file in the top-level directory.
 - *
 - * Contributions after 2012-01-13 are licensed under the terms of the
 - * GNU GPL, version 2 or (at your option) any later version.
 - */
 -
 -#include qemu-common.h
 -#include hw/hw.h
 -#include qemu-timer.h
 -#include qemu-char.h
 -#include buffered_file.h
 -#include qemu-thread.h
 -
 -//#define DEBUG_BUFFERED_FILE
 -
 -typedef struct QEMUFileBuffered
 -{
 -MigrationState *migration_state;
 -QEMUFile *file;
 -size_t bytes_xfer;
 -size_t xfer_limit;
 -uint8_t *buffer;
 -size_t buffer_size;
 -size_t buffer_capacity;
 -QemuThread thread;
 -} QEMUFileBuffered;
 -
 -#ifdef DEBUG_BUFFERED_FILE
 -#define DPRINTF(fmt, ...) \
 -do { printf(buffered-file:  fmt, ## __VA_ARGS__); } while (0)
 -#else
 -#define DPRINTF(fmt, ...) \
 -do { } while (0)
 -#endif
 -
 -static ssize_t buffered_flush(QEMUFileBuffered *s)
 -{
 -size_t offset = 0;
 -ssize_t ret = 0;
 -
 -DPRINTF(flushing %zu byte(s) of data\n, s-buffer_size);
 -
 -while (s-bytes_xfer  s-xfer_limit  offset  s-buffer_size) {
 -
 -ret = migrate_fd_put_buffer(s-migration_state, s-buffer + offset,
 -s-buffer_size - offset);
 -if (ret = 0) {
 -DPRINTF(error flushing data, %zd\n, ret);
 -break;
 -} else {
 -DPRINTF(flushed %zd byte(s)\n, ret);
 -offset += ret;
 -s-bytes_xfer += ret;
 -}
 -}
 -
 -DPRINTF(flushed %zu of %zu byte(s)\n, offset, s-buffer_size);
 -memmove(s-buffer, s-buffer + offset, s-buffer_size - offset);
 -s-buffer_size -= offset;
 -
 -if (ret  0) {
 -return ret;
 -}
 -return offset;
 -}
 -
 -static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t 
 pos, int size)
 -{
 -QEMUFileBuffered *s = opaque;
 -ssize_t error;
 -
 -DPRINTF(putting %d bytes at % PRId64 \n, size, pos);
 -
 -error = qemu_file_get_error(s-file);
 -if (error) {
 -DPRINTF(flush when error, bailing: %s\n, strerror(-error));
 -return error;
 -}
 -
 -if (size = 0) {
 -return size;
 -}
 -
 -if (size  (s-buffer_capacity - s-buffer_size)) {
 -DPRINTF(increasing buffer capacity from %zu by %zu\n,
 -s-buffer_capacity, size + 1024);
 -
 -s-buffer_capacity += size + 1024;
 -
 -s-buffer = g_realloc(s-buffer, s-buffer_capacity);
 -}
 -
 -memcpy(s-buffer + s-buffer_size, buf, size);
 -s-buffer_size += size;
 -
 -return size;
 -}
 -
 -static int buffered_close(void *opaque)
 -{
 -QEMUFileBuffered *s = opaque;
 -ssize_t ret = 0;
 -int ret2;
 -
 -DPRINTF(closing\n);
 -
 -s-xfer_limit = INT_MAX;
 -while (!qemu_file_get_error(s-file)  s-buffer_size) {
 -ret = buffered_flush(s);
 -if (ret  0) {
 -break;
 -}
 -}
 -
 -ret2 = migrate_fd_close(s-migration_state);
 -if (ret = 0) {
 -ret = ret2;
 -}
 -ret = migrate_fd_close(s-migration_state);
 -s-migration_state-complete = true;
 -return ret;
 -}
 -
 -/*
 - * The meaning of the return values is:
 - *   0: We can continue sending
 - *   1: Time to stop
 - *   negative: There has been an error
 - */
 -static int buffered_rate_limit(void *opaque)
 -{
 -QEMUFileBuffered *s = opaque;
 -int ret;
 -
 -ret 

[Qemu-devel] [PATCH 02/30] add a version number to ram_list

2012-10-18 Thread Juan Quintela
From: Umesh Deshpande udesh...@redhat.com

This will be used to detect if last_block might have become invalid
across different calls to ram_save_live.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Umesh Deshpande udesh...@redhat.com
Signed-off-by: Juan Quintela quint...@redhat.com
---
 arch_init.c | 7 ++-
 cpu-all.h   | 1 +
 exec.c  | 5 -
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 4293557..b47313d 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -336,6 +336,7 @@ static RAMBlock *last_block;
 static ram_addr_t last_offset;
 static unsigned long *migration_bitmap;
 static uint64_t migration_dirty_pages;
+static uint32_t last_version;

 static inline bool migration_bitmap_test_and_reset_dirty(MemoryRegion *mr,
  ram_addr_t offset)
@@ -406,7 +407,6 @@ static void migration_bitmap_sync(void)
 }
 }

-
 /*
  * ram_save_block: Writes a page of memory to the stream f
  *
@@ -558,6 +558,7 @@ static void reset_ram_globals(void)
 {
 last_block = NULL;
 last_offset = 0;
+last_version = ram_list.version;
 sort_ram_list();
 }

@@ -613,6 +614,10 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 uint64_t expected_downtime;
 MigrationState *s = migrate_get_current();

+if (ram_list.version != last_version) {
+reset_ram_globals();
+}
+
 bytes_transferred_last = bytes_transferred;
 bwidth = qemu_get_clock_ns(rt_clock);

diff --git a/cpu-all.h b/cpu-all.h
index 6558a6f..e07c91c 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -500,6 +500,7 @@ typedef struct RAMBlock {

 typedef struct RAMList {
 uint8_t *phys_dirty;
+uint32_t version;
 QLIST_HEAD(, RAMBlock) blocks_mru;
 QLIST_HEAD(, RAMBlock) blocks;
 } RAMList;
diff --git a/exec.c b/exec.c
index 718bbc2..1e04711 100644
--- a/exec.c
+++ b/exec.c
@@ -637,7 +637,6 @@ bool tcg_enabled(void)
 void cpu_exec_init_all(void)
 {
 #if !defined(CONFIG_USER_ONLY)
-qemu_mutex_init(ram_list.mutex);
 memory_map_init();
 io_mem_init();
 #endif
@@ -2575,6 +2574,8 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void 
*host,
 QLIST_INSERT_HEAD(ram_list.blocks, new_block, next);
 QLIST_INSERT_HEAD(ram_list.blocks_mru, new_block, next_mru);

+ram_list.version++;
+
 ram_list.phys_dirty = g_realloc(ram_list.phys_dirty,
last_ram_offset()  TARGET_PAGE_BITS);
 memset(ram_list.phys_dirty + (new_block-offset  TARGET_PAGE_BITS),
@@ -2602,6 +2603,7 @@ void qemu_ram_free_from_ptr(ram_addr_t addr)
 if (addr == block-offset) {
 QLIST_REMOVE(block, next);
 QLIST_REMOVE(block, next_mru);
+ram_list.version++;
 g_free(block);
 return;
 }
@@ -2616,6 +2618,7 @@ void qemu_ram_free(ram_addr_t addr)
 if (addr == block-offset) {
 QLIST_REMOVE(block, next);
 QLIST_REMOVE(block, next_mru);
+ram_list.version++;
 if (block-flags  RAM_PREALLOC_MASK) {
 ;
 } else if (mem_path) {
-- 
1.7.11.7




Re: [Qemu-devel] [PATCH 00/30] Migration thread 20121017 edition

2012-10-18 Thread Paolo Bonzini
Il 18/10/2012 09:29, Juan Quintela ha scritto:
 v3:
 
 This is work in progress on top of the previous migration series just sent.
 
 - Introduces a thread for migration instead of using a timer and callback
 - remove the writting to the fd from the iothread lock
 - make the writes synchronous
 - Introduce a new pending method that returns how many bytes are pending for
   one save live section
 - last patch just shows printfs to see where the time is being spent
   on the migration complete phase.
   (yes it pollutes all uses of stop on the monitor)
 
 So far I have found that we spent a lot of time on bdrv_flush_all() It
 can take from 1ms to 600ms (yes, it is not a typo).  That dwarfs the
 migration default downtime time (30ms).
 
 Stop all vcpus:
 
 - it works now (after the changes on qemu_cpu_is_vcpu on the previous
   series) caveat is that the time that brdv_flush_all() takes is
   unpredictable.  Any silver bullets?

You could reuse the block live migration item.  In block_save_pending,
start a bdrv_aio_flush() on all block devices that have already
completed the previous one.

But that's not a regression in the migration thread, isn't it?

Paolo



[Qemu-devel] [PATCH 14/30] migration: include qemu-file.h

2012-10-18 Thread Juan Quintela
They don't use/know anything about buffered-file.

Signed-off-by: Juan Quintela quint...@redhat.com
---
 migration-exec.c | 2 +-
 migration-fd.c   | 2 +-
 migration-tcp.c  | 2 +-
 migration-unix.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration-exec.c b/migration-exec.c
index 908f22e..c451545 100644
--- a/migration-exec.c
+++ b/migration-exec.c
@@ -19,7 +19,7 @@
 #include qemu_socket.h
 #include migration.h
 #include qemu-char.h
-#include buffered_file.h
+#include qemu-file.h
 #include block.h
 #include sys/types.h
 #include sys/wait.h
diff --git a/migration-fd.c b/migration-fd.c
index aba1be7..25f0245 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -18,7 +18,7 @@
 #include migration.h
 #include monitor.h
 #include qemu-char.h
-#include buffered_file.h
+#include qemu-file.h
 #include block.h
 #include qemu_socket.h

diff --git a/migration-tcp.c b/migration-tcp.c
index 812ae22..fdf8b0f 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -17,7 +17,7 @@
 #include qemu_socket.h
 #include migration.h
 #include qemu-char.h
-#include buffered_file.h
+#include qemu-file.h
 #include block.h

 //#define DEBUG_MIGRATION_TCP
diff --git a/migration-unix.c b/migration-unix.c
index bb41bac..64bc0c3 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -17,7 +17,7 @@
 #include qemu_socket.h
 #include migration.h
 #include qemu-char.h
-#include buffered_file.h
+#include qemu-file.h
 #include block.h

 //#define DEBUG_MIGRATION_UNIX
-- 
1.7.11.7




Re: [Qemu-devel] [RFC V2 01/20] qcow2: Add deduplication to the qcow2 specification.

2012-10-18 Thread Benoît Canet
 That's still too sparse for a formal documentation - what is the format
 of the deduplication table, and what do the bits in that table imply
 with regards to how the rest of the qcow2 file is used?

I will add this in the next iteration.

Best regards

Benoît




Re: [Qemu-devel] [Patch]KVM: enabling per domain PLE

2012-10-18 Thread Hu, Xuekun

 -Original Message-
 From: Avi Kivity [mailto:a...@redhat.com]
 Sent: Wednesday, October 17, 2012 6:12 PM
 To: Hu, Xuekun
 Cc: k...@vger.kernel.org; qemu-devel@nongnu.org; Zhang, Xiantao
 Subject: Re: [Patch]KVM: enabling per domain PLE
 
 On 10/17/2012 10:02 AM, Hu, Xuekun wrote:
 
  The problem with this is that it requires an administrator to
  understand the workload, not only of the guest, but also of other guests on
 the machine.
  With low overcommit, a high PLE window reduces unneeded exits, but
  with high overcommit we need those exits to reduce spinning.
 
  In addition, most kvm hosts don't have an administrator.  They are
  controlled by a management system, which means we'll need some
  algorithm in userspace to control the PLE window.  Taking the two
  together, we need a dynamic (for changing workloads) algorithm.
 
  There are threads discussing this dynamic algorithm, we are making
  slow progress because it's such a difficult problem, but I think this
  is much more useful than anything requiring user intervention.
 
  Avi, agreed that dynamic adaptive ple should be the best solution.
  However currently it is a difficult problem like you said. Our
  solution just gives user a choice who know how to set the two PLE
  values. So the solution is a compromise solution, which should be
  better than nothing, for now? :-)
 
 Let's see how the PLE thread works out.  Yes the patches give the user 
 control,
 but we need to make sure the user knows how to control it (in fact your patch
 doesn't even update the documentation).  Just throwing out a new ioctl, even
 if it is documented, doesn't mean that userspace will begin to use it, or that
 users will exploit it.
 
 Do you have a specific use case in mind?
 

Yes, some cloud vendors already knew that different PLE values has big 
performance
impact on their applications. They want one interface for them to set. And I 
think the
big cloud vendors should have administrators that have experience on PLE 
tuning. :-) 

 --
 error compiling committee.c: too many arguments to function



[Qemu-devel] [PATCH] qmp: handle stop/cont in INMIGRATE state

2012-10-18 Thread Paolo Bonzini
Right now, stop followed by an incoming migration will let the
virtual machine start.  cont before an incoming migration instead
will fail.

This is bad because the actual behavior is not predictable; it is
racy with respect to the start of the incoming migration.  That's
because incoming migration is blocking, and thus will delay the
processing of stop/cont until the end of the migration.

In addition, there's nothing that really prevents the user from
typing the block device's passwords before incoming migration is
done, so we may as well allow that.

Both things can be fixed by just toggling the autostart variable when
stop/cont are called in INMIGRATE state.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 qmp.c | 17 +++--
 1 file modificato, 11 inserzioni(+), 6 rimozioni(-)

diff --git a/qmp.c b/qmp.c
index 36c54c5..2c8d559 100644
--- a/qmp.c
+++ b/qmp.c
@@ -85,7 +85,11 @@ void qmp_quit(Error **err)
 
 void qmp_stop(Error **errp)
 {
-vm_stop(RUN_STATE_PAUSED);
+if (runstate_check(RUN_STATE_INMIGRATE)) {
+autostart = 0;
+} else {
+vm_stop(RUN_STATE_PAUSED);
+}
 }
 
 void qmp_system_reset(Error **errp)
@@ -144,10 +148,7 @@ void qmp_cont(Error **errp)
 {
 Error *local_err = NULL;
 
-if (runstate_check(RUN_STATE_INMIGRATE)) {
-error_set(errp, QERR_MIGRATION_EXPECTED);
-return;
-} else if (runstate_check(RUN_STATE_INTERNAL_ERROR) ||
+if (runstate_check(RUN_STATE_INTERNAL_ERROR) ||
runstate_check(RUN_STATE_SHUTDOWN)) {
 error_set(errp, QERR_RESET_REQUIRED);
 return;
@@ -162,7 +163,11 @@ void qmp_cont(Error **errp)
 return;
 }
 
-vm_start();
+if (runstate_check(RUN_STATE_INMIGRATE)) {
+autostart = 1;
+} else {
+vm_start();
+}
 }
 
 void qmp_system_wakeup(Error **errp)
-- 
1.7.12.1




[Qemu-devel] [PATCH] vnc-tls: Fix compilation with newer versions of GNU-TLS

2012-10-18 Thread Andre Przywara
In my installation of GNU-TLS (v3.0.23) the type
gnutls_anon_server_credentials is marked deprecated, so -Werror
breaks compilation.
Simply replacing it with the newer ..._t version fixed the compilation
on my machine (Slackware 14.0). I cannot tell how far back this new
type goes, at least the header file in RHEL 5.0 (v1.4.1) seems to have
it already. If someone finds a broken distribution, tell me and I
insert some compat code.

Signed-off-by: Andre Przywara andre.przyw...@amd.com
---
 ui/vnc-tls.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ui/vnc-tls.c b/ui/vnc-tls.c
index a7f7d07..ba3827b 100644
--- a/ui/vnc-tls.c
+++ b/ui/vnc-tls.c
@@ -99,9 +99,9 @@ static ssize_t vnc_tls_pull(gnutls_transport_ptr_t transport,
 }
 
 
-static gnutls_anon_server_credentials vnc_tls_initialize_anon_cred(void)
+static gnutls_anon_server_credentials_t vnc_tls_initialize_anon_cred(void)
 {
-gnutls_anon_server_credentials anon_cred;
+gnutls_anon_server_credentials_t anon_cred;
 int ret;
 
 if ((ret = gnutls_anon_allocate_server_credentials(anon_cred))  0) {
@@ -382,7 +382,7 @@ int vnc_tls_client_setup(struct VncState *vs,
 }
 
 } else {
-gnutls_anon_server_credentials anon_cred = 
vnc_tls_initialize_anon_cred();
+gnutls_anon_server_credentials_t anon_cred = 
vnc_tls_initialize_anon_cred();
 if (!anon_cred) {
 gnutls_deinit(vs-tls.session);
 vs-tls.session = NULL;
-- 
1.7.12.1





Re: [Qemu-devel] [PATCH] vnc-tls: Fix compilation with newer versions of GNU-TLS

2012-10-18 Thread Gerd Hoffmann
On 10/18/12 11:16, Andre Przywara wrote:
 In my installation of GNU-TLS (v3.0.23) the type
 gnutls_anon_server_credentials is marked deprecated, so -Werror
 breaks compilation.
 Simply replacing it with the newer ..._t version fixed the compilation
 on my machine (Slackware 14.0). I cannot tell how far back this new
 type goes, at least the header file in RHEL 5.0 (v1.4.1) seems to have
 it already. If someone finds a broken distribution, tell me and I
 insert some compat code.

Acked-by: Gerd Hoffmann kra...@redhat.com

cheers,
  Gerd



Re: [Qemu-devel] [PATCH 2/3] chardev: fix QemuOpts lifecycle

2012-10-18 Thread Gerd Hoffmann
  Hi,

 @@ -2864,6 +2868,7 @@ void qemu_chr_delete(CharDriverState *chr)
  chr-chr_close(chr);
  g_free(chr-filename);
  g_free(chr-label);
 +qemu_opts_del(chr-opts);
  g_free(chr);
  }
 
 Didn't consider this in my first review, but is chardev_init_func() fine
 with this? Basically, if we chardev_remove a device created by it (is this
 possible?) then the opts will be freed but it will remain inserted in its
 QemuOptsList.

Hmm? Don't see your issue here.
qemu_opts_del will also unlink from QemuOptsList ...

cheers,
  Gerd




Re: [Qemu-devel] [RFC PATCH v3 06/19] Implement -dimm command line option

2012-10-18 Thread Vasilis Liaskovitis
On Wed, Oct 17, 2012 at 12:03:51PM +0200, Avi Kivity wrote:
 On 10/17/2012 11:19 AM, Vasilis Liaskovitis wrote:
  
  I don't think so, but probably there's a limit of DIMMs that real
  controllers have, something like 8 max.
  
  In the case of i440fx specifically, do you mean that we should model the DRB
  (Dram row boundary registers in section 3.2.19 of the i440fx spec) ?
  
  The i440fx DRB registers only supports up to 8 DRAM rows (let's say 1 row
  maps 1-1 to a DimmDevice for this discussion) and only supports up to 2GB of
  memory afaict (bit 31 and above is ignored).
  
  I 'd rather not model this part of the i440fx - having only 8 DIMMs seems 
  too
  restrictive. The rest of the patchset supports up to 255 DIMMs so it would 
  be a
  waste imho to model an old pc memory controller that only supports 8 DIMMs.
  
  There was also an old discussion about i440fx modeling here:
  https://lists.nongnu.org/archive/html/qemu-devel/2011-07/msg02705.html
  the general direction was that i440fx is too old and we don't want to 
  precisely
  emulate the DRB registers, since they lack flexibility.
  
  Possible solutions:
  
  1) is there a newer and more flexible chipset that we could model?
 
 Look for q35 on this list.

thanks, I 'll take a look. It sounds like the other options below are more
straightforward now, but let me know if you prefer q35 integration as a 
priority.

 
  
  2) model and document 
  ^--- the critical bit
 
  a generic (non-existent) i440fx that would support more
  and larger DIMMs. E.g. support 255 DIMMs. If we want to use a description
  similar to the i440fx DRB registers, the registers would take up a lot of 
  space.
  In i440fx there is one 8-bit DRB register per DIMM, and DRB[i] describes how
  many 8MB chunks are contained in DIMMs 0...i. So, the register values are
  cumulative (and total described memory cannot exceed 256x8MB = 2GB)
 
 Our i440fx has already been extended by support for pci and cpu hotplug,
 and I see no reason not to extend it for memory.  We can allocate extra
 mmio space for registers if needed.  Usually I'm against this sort of
 thing, but in this case we don't have much choice.

ok

 
  
  We could for example model: 
  - an 8-bit non-cumulative register for each DIMM, denoting how many
  128MB chunks it contains. This allowes 32GB for each DIMM, and with 255 
  DIMMs we
  describe a bit less than 8TB. These registers require 255 bytes.
  - a 16-bit cumulative register for each DIMM again for 128MB chunks. This 
  allows
  us to describe 8TB of memory (but the registers take up double the space, 
  because
  they describe cumulative memory amounts)
 
 There is no reason to save space.  Why not have two 64-bit registers per
 DIMM, one describing the size and the other the base address, both in
 bytes?  Use a few low order bits for control.

Do we want this generic scheme above to be tied into the i440fx/pc machine?
Or have it as a separate generic memory bus / pmc usable by others (e.g. in
hw/dimm.c)?
The 64-bit values you describe are already part of DimmDevice properties, but
they are not hardware registers described as part of a chipset.

In terms of control bits, did you want to mimic some other chipset registers? - 
any examples would be useful.

 
  
  3) let everything be handled/abstracted by dimmbus - the chipset DRB 
  modelling
  is not done (at least for i440fx, other machines could). This is the least 
  precise
  in terms of emulation. On the other hand, if we are not really trying to 
  emulate
  the real (too restrictive) hardware, does it matter?
 
 We could emulate base memory using the chipset, and extra memory using
 the scheme above.  This allows guests that are tied to the chipset to
 work, and guests that have more awareness (seabios) to use the extra
 features.

But if we use the real i440fx pmc DRBs for base memory, this means base memory
would be = 2GB, right?

Sounds like we 'd need to change the DRBs anyway to describe useful amounts of
base memory (e.g. 512MB chunks and check against address lines [36:29] can
describe base memory up to 64GB, though that's still limiting for very large
VMs). But we'd be diverting from the real hardware again.

Then we can model base memory with tweaked i440fx pmc's DRB registers - we
could only use DRB[0] (one DIMM describing all of base memory) or more.

DIMMs would be allowed to be hotplugged in the generic mem-controller scheme 
only
(unless it makes sense to allow hotplug in the remaining pmc DRBs and
start using the generic scheme once we run out of emulated DRBs)

thanks,

- Vasilis



Re: [Qemu-devel] [PATCH v3] qemu-img: document 'info --backing-chain'

2012-10-18 Thread Kevin Wolf
Am 18.10.2012 07:55, schrieb Kashyap Chamarthy:
 Signed-off-by: Kashyap Chamarthy kashyap...@gmail.com
 ---
  qemu-img-cmds.hx |  4 ++--
  qemu-img.texi| 21 -
  2 files changed, 22 insertions(+), 3 deletions(-)

Thanks, applied to the block branch.

Kevin



Re: [Qemu-devel] [PATCH] block: bdrv_create(): don't leak cco.filename on error

2012-10-18 Thread Kevin Wolf
Am 17.10.2012 21:45, schrieb Luiz Capitulino:
 
 Signed-off-by: Luiz Capitulino lcapitul...@redhat.com

Thanks, applied to the block branch.

Kevin



[Qemu-devel] [PATCH V13 2/6] make path_has_protocol non static

2012-10-18 Thread Dong Xu Wang
We will use path_has_protocol outside block.c, so just make it public.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
Reviewed-by: Michael Roth mdr...@linux.vnet.ibm.com
---
 block.c |2 +-
 block.h |1 +
 2 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/block.c b/block.c
index f639655..03ba485 100644
--- a/block.c
+++ b/block.c
@@ -198,7 +198,7 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
 }
 
 /* check if the path starts with protocol: */
-static int path_has_protocol(const char *path)
+int path_has_protocol(const char *path)
 {
 const char *p;
 
diff --git a/block.h b/block.h
index 7842d85..364ba04 100644
--- a/block.h
+++ b/block.h
@@ -329,6 +329,7 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, 
QEMUSnapshotInfo *sn);
 
 char *get_human_readable_size(char *buf, int buf_size, int64_t size);
 int path_is_absolute(const char *path);
+int path_has_protocol(const char *path);
 void path_combine(char *dest, int dest_size,
   const char *base_path,
   const char *filename);
-- 
1.7.1




[Qemu-devel] [PATCH V13 0/6] add-cow file format

2012-10-18 Thread Dong Xu Wang
It will introduce a new file format: add-cow.

The add-cow file format makes it possible to perform copy-on-write on top of
a raw disk image.  When we know that no backing file clusters remain visible
(e.g. we have streamed the entire image and copied all data from the backing
file), then it is possible to discard the add-cow file and use the raw image
file directly.

This feature adds the copy-on-write feature to raw files (which cannot support
it natively) while allowing us to get full performance again later when we no
longer need copy-on-write.

add-cow can benefit from other available functions, such as path_has_protocol
and qed_read_string, so we will make them public.

snapshot_blkdev are not supported now for add-cow. Will add it in futher 
patches.

These patches are using QemuOpts parser, former patches could be found here:
http://patchwork.ozlabs.org/patch/191347/


v12-v13:
1) Use QemuOpts, not QEMUOptionParameter
2) cluster_size configuable
3) Refactor block-cache.c
4) Correct qemu-iotests script.
5) Other bug fix.

v11-v12:
1) Removed un-used feature bit.
2) Share cache code with qcow2.c.
3) Remove snapshot_blkdev support, will add it in another patch.
5) COW Bitmap field in add-cow file will be multiple of 65536.
6) fix grammer and typo.

Dong Xu Wang (6):
  docs: document for add-cow file format
  make path_has_protocol non static
  qed_read_string to bdrv_read_string
  rename qcow2-cache.c to block-cache.c
  add-cow file format core code.
  qemu-iotests: add add-cow iotests support.

 block.c  |   29 ++-
 block.h  |3 +
 block/Makefile.objs  |4 +-
 block/add-cow.c  |  693 ++
 block/add-cow.h  |   85 +
 block/block-cache.c  |  321 +++
 block/block-cache.h  |   77 +
 block/qcow2-cache.c  |  323 
 block/qcow2-cluster.c|   54 ++--
 block/qcow2-refcount.c   |   67 +++--
 block/qcow2.c|   21 +-
 block/qcow2.h|   24 +--
 block/qed.c  |   34 +--
 block_int.h  |2 +
 docs/specs/add-cow.txt   |  139 +
 tests/qemu-iotests/017   |2 +-
 tests/qemu-iotests/020   |2 +-
 tests/qemu-iotests/common|6 +
 tests/qemu-iotests/common.rc |   15 +-
 trace-events |   13 +-
 20 files changed, 1465 insertions(+), 449 deletions(-)
 create mode 100644 block/add-cow.c
 create mode 100644 block/add-cow.h
 create mode 100644 block/block-cache.c
 create mode 100644 block/block-cache.h
 delete mode 100644 block/qcow2-cache.c
 create mode 100644 docs/specs/add-cow.txt




[Qemu-devel] [PATCH V13 1/6] docs: document for add-cow file format

2012-10-18 Thread Dong Xu Wang
Document for add-cow format, the usage and spec of add-cow are introduced.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 docs/specs/add-cow.txt |  139 
 1 files changed, 139 insertions(+), 0 deletions(-)
 create mode 100644 docs/specs/add-cow.txt

diff --git a/docs/specs/add-cow.txt b/docs/specs/add-cow.txt
new file mode 100644
index 000..dc1e107
--- /dev/null
+++ b/docs/specs/add-cow.txt
@@ -0,0 +1,139 @@
+== General ==
+
+The raw file format does not support backing files or copy on write feature.
+The add-cow image format makes it possible to use backing files with raw
+image by keeping a separate .add-cow metadata file. Once all sectors
+have been written into the raw image it is safe to discard the .add-cow
+and backing files, then we can use the raw image directly.
+
+An example usage of add-cow would look like::
+(ubuntu.img is a disk image which has been installed OS.)
+1)  Create a raw image with the same size of ubuntu.img
+qemu-img create -f raw test.raw 8G
+2)  Create an add-cow image which will store dirty bitmap
+qemu-img create -f add-cow test.add-cow \
+-o backing_file=ubuntu.img,image_file=test.raw
+3)  Run qemu with add-cow image
+qemu -drive if=virtio,file=test.add-cow
+
+test.raw may be larger than ubuntu.img, in that case, the size of test.add-cow
+will be calculated from the size of test.raw.
+
+=Specification=
+
+The file format looks like this:
+
+ +---+-+-+
+ | Header|   Reserved  |COW bitmap   |
+ +---+-+-+
+
+All numbers in add-cow are stored in Little Endian byte order.
+
+== Header ==
+
+The Header is included in the first bytes:
+(#define HEADER_SIZE (4096 * header_size))
+Byte0 -  7: magic
+add-cow magic string (ADD_COW\xff).
+
+8 -  11:version
+Version number (only valid value is 1 now).
+
+12 - 15:backing file name offset
+Offset in the add-cow file at which the backing file
+name is stored (NB: The string is not nul-terminated).
+If backing file name does NOT exist, this field will be
+0. Must be between 80 and [HEADER_SIZE - 2](a file name
+must be at least 1 byte).
+
+16 - 19:backing file name size
+Length of the backing file name in bytes. It will be 0
+if the backing file name offset is 0. If backing file
+name offset is non-zero, then it must be non-zero. Must
+be less than [HEADER_SIZE - 80] to fit in the reserved
+part of the header.
+
+20 - 23:image file name offset
+Offset in the add-cow file at which the image file name
+is stored (NB: The string is not null terminated). It
+must be between 80 and [HEADER_SIZE - 2].
+
+24 - 27:image file name size
+Length of the image file name in bytes.
+Must be less than [HEADER_SIZE - 80] to fit in the 
reserved
+part of the header.
+
+28 - 31:cluster bits
+Number of bits that are used for addressing an offset
+within a cluster (1  cluster_bits is the cluster 
size).
+Must not be less than 9 (i.e. 512 byte clusters).
+
+Note: qemu as of today has an implementation limit of 
2 MB
+as the maximum cluster size and won't be able to open 
images
+with larger cluster sizes.
+
+32 - 39:features
+Bitmask of features. An implementation can safely 
ignore
+any unknown bits that are set.
+
+Bit 0:  All allocated bit.  If this bit is set then
+backing file and COW bitmap will not be 
used,
+and can read from or write to image file 
directly.
+
+Bits 1-63:  Reserved (set to 0)
+
+40 - 47:optional features
+Not used now. Reserved for future use. It must be set 
to 0.
+And must be ignored while reading.
+
+48 - 51:header size
+The header field is variable-sized. This field 
indicates
+how many 4096 bytes will be used to store add-cow 
header.
+In add-cow v1, it is fixed to 1, so the header size 
will
+be 4096 * 1 = 4096 bytes.
+
+52 - 67:backing file format
+ 

[Qemu-devel] [PATCH V13 4/6] rename qcow2-cache.c to block-cache.c

2012-10-18 Thread Dong Xu Wang
We will re-use qcow2-cache as block layer common cache code,
so change its name and made some changes, define a struct named
BlockTableType, pass BlockTableType and table size parameters to
block cache initialization function.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 block/Makefile.objs|3 +-
 block/block-cache.c|  317 +++
 block/block-cache.h|   76 +++
 block/qcow2-cache.c|  323 
 block/qcow2-cluster.c  |   54 +
 block/qcow2-refcount.c |   67 ++-
 block/qcow2.c  |   21 ++--
 block/qcow2.h  |   24 +---
 trace-events   |   13 +-
 9 files changed, 483 insertions(+), 415 deletions(-)
 create mode 100644 block/block-cache.c
 create mode 100644 block/block-cache.h
 delete mode 100644 block/qcow2-cache.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 554f429..f128b78 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,6 @@
 block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o 
vvfat.o
-block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
qcow2-cache.o
+block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
+block-obj-y += block-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
diff --git a/block/block-cache.c b/block/block-cache.c
new file mode 100644
index 000..bf5c57c
--- /dev/null
+++ b/block/block-cache.c
@@ -0,0 +1,317 @@
+/*
+ * QEMU Block Layer Cache
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Dong Xu Wang wdon...@linux.vnet.ibm.com
+ *
+ * This file is based on qcow2-cache.c, see its copyrights below:
+ *
+ * L2/refcount table cache for the QCOW2 format
+ *
+ * Copyright (c) 2010 Kevin Wolf kw...@redhat.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include block_int.h
+#include qemu-common.h
+#include trace.h
+#include block-cache.h
+
+BlockCache *block_cache_create(BlockDriverState *bs, int num_tables,
+   size_t cluster_size, BlockTableType type)
+{
+BlockCache *c;
+int i;
+
+c = g_malloc0(sizeof(*c));
+c-size = num_tables;
+c-entries = g_malloc0(sizeof(*c-entries) * num_tables);
+c-table_type = type;
+c-cluster_size = cluster_size;
+
+for (i = 0; i  c-size; i++) {
+c-entries[i].table = qemu_blockalign(bs, cluster_size);
+}
+
+return c;
+}
+
+int block_cache_destroy(BlockDriverState *bs, BlockCache *c)
+{
+int i;
+
+for (i = 0; i  c-size; i++) {
+assert(c-entries[i].ref == 0);
+qemu_vfree(c-entries[i].table);
+}
+
+g_free(c-entries);
+g_free(c);
+
+return 0;
+}
+
+static int block_cache_flush_dependency(BlockDriverState *bs, BlockCache *c)
+{
+int ret;
+
+ret = block_cache_flush(bs, c-depends);
+if (ret  0) {
+return ret;
+}
+
+c-depends = NULL;
+c-depends_on_flush = false;
+
+return 0;
+}
+
+static int block_cache_entry_flush(BlockDriverState *bs, BlockCache *c, int i)
+{
+int ret = 0;
+
+if (!c-entries[i].dirty || !c-entries[i].offset) {
+return 0;
+}
+
+trace_block_cache_entry_flush(qemu_coroutine_self(), c-table_type, i);
+
+if (c-depends) {
+ret = block_cache_flush_dependency(bs, c);
+} else if (c-depends_on_flush) {
+ret = bdrv_flush(bs-file);
+if (ret = 0) {
+c-depends_on_flush = false;
+}
+}
+
+if (ret  0) {
+return ret;
+}
+
+if (c-table_type == BLOCK_TABLE_REF) {
+BLKDBG_EVENT(bs-file, BLKDBG_REFBLOCK_UPDATE_PART);
+} else if (c-table_type == BLOCK_TABLE_L2) {
+BLKDBG_EVENT(bs-file, BLKDBG_L2_UPDATE);
+}
+
+ret = bdrv_pwrite(bs-file, c-entries[i].offset,
+   

[Qemu-devel] [PATCH V13 6/6] qemu-iotests: add add-cow iotests support.

2012-10-18 Thread Dong Xu Wang
This patch will use qemu-iotests to test add-cow file format.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 tests/qemu-iotests/017   |2 +-
 tests/qemu-iotests/020   |2 +-
 tests/qemu-iotests/common|6 ++
 tests/qemu-iotests/common.rc |   15 ++-
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/017 b/tests/qemu-iotests/017
index 66951eb..d31432f 100755
--- a/tests/qemu-iotests/017
+++ b/tests/qemu-iotests/017
@@ -40,7 +40,7 @@ trap _cleanup; exit \$status 0 1 2 3 15
 . ./common.pattern
 
 # Any format supporting backing files
-_supported_fmt qcow qcow2 vmdk qed
+_supported_fmt qcow qcow2 vmdk qed add-cow
 _supported_proto generic
 _supported_os Linux
 
diff --git a/tests/qemu-iotests/020 b/tests/qemu-iotests/020
index 2fb0ff8..3dbb495 100755
--- a/tests/qemu-iotests/020
+++ b/tests/qemu-iotests/020
@@ -42,7 +42,7 @@ trap _cleanup; exit \$status 0 1 2 3 15
 . ./common.pattern
 
 # Any format supporting backing files
-_supported_fmt qcow qcow2 vmdk qed
+_supported_fmt qcow qcow2 vmdk qed add-cow
 _supported_proto generic
 _supported_os Linux
 
diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common
index 1f6fdf5..4c06895 100644
--- a/tests/qemu-iotests/common
+++ b/tests/qemu-iotests/common
@@ -128,6 +128,7 @@ common options
 check options
 -rawtest raw (default)
 -cowtest cow
+-add-cowtest add-cow
 -qcow   test qcow
 -qcow2  test qcow2
 -qedtest qed
@@ -163,6 +164,11 @@ testlist options
xpand=false
;;
 
+   -add-cow)
+IMGFMT=add-cow
+xpand=false
+;;
+
-qcow)
IMGFMT=qcow
xpand=false
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index d534e94..f48b02a 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -97,6 +97,16 @@ _make_test_img()
 fi
 if [ \( $IMGFMT = qcow2 -o $IMGFMT = qed \) -a -n $CLUSTER_SIZE 
]; then
 optstr=$(_optstr_add $optstr cluster_size=$CLUSTER_SIZE)
+elif [ $IMGFMT = add-cow ]; then
+local IMG=$TEST_IMG.raw
+if [ $1 = -b ]; then
+IMG=$IMG.b
+$QEMU_IMG create -f raw $IMG $image_size/dev/null
+extra_img_options=-o image_file=$IMG $extra_img_options
+else
+$QEMU_IMG create -f raw $IMG $image_size/dev/null
+extra_img_options=-o image_file=$IMG
+fi
 fi
 
 if [ -n $optstr ]; then
@@ -114,7 +124,8 @@ _make_test_img()
 -e s# compat='[^']*'##g \
 -e s# compat6=\\(on\\|off\\)##g \
 -e s# static=\\(on\\|off\\)##g \
--e s# lazy_refcounts=\\(on\\|off\\)##g
+-e s# lazy_refcounts=\\(on\\|off\\)##g \
+-e s# image_file='[^']*'##g
 }
 
 _cleanup_test_img()
@@ -125,6 +136,8 @@ _cleanup_test_img()
 rm -f $TEST_DIR/t.$IMGFMT
 rm -f $TEST_DIR/t.$IMGFMT.orig
 rm -f $TEST_DIR/t.$IMGFMT.base
+rm -f $TEST_DIR/t.$IMGFMT.raw
+rm -f $TEST_DIR/t.$IMGFMT.raw.b
 ;;
 
 rbd)
-- 
1.7.1




[Qemu-devel] [PATCH V13 5/6] add-cow file format core code.

2012-10-18 Thread Dong Xu Wang
add-cow file format core code. It use block-cache.c as cache code.
It lacks of snapshot_blkdev support.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 block/Makefile.objs |1 +
 block/add-cow.c |  693 +++
 block/add-cow.h |   85 +++
 block/block-cache.c |4 +
 block/block-cache.h |1 +
 block_int.h |2 +
 6 files changed, 786 insertions(+), 0 deletions(-)
 create mode 100644 block/add-cow.c
 create mode 100644 block/add-cow.h

diff --git a/block/Makefile.objs b/block/Makefile.objs
index f128b78..ed9222d 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,5 +1,6 @@
 block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o 
vvfat.o
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
+block-obj-y += add-cow.o
 block-obj-y += block-cache.o
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
diff --git a/block/add-cow.c b/block/add-cow.c
new file mode 100644
index 000..15c86ab
--- /dev/null
+++ b/block/add-cow.c
@@ -0,0 +1,693 @@
+/*
+ * QEMU ADD-COW Disk Format
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ *  Dong Xu Wang wdon...@linux.vnet.ibm.com
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include qemu-common.h
+#include block_int.h
+#include module.h
+#include add-cow.h
+
+static void add_cow_header_le_to_cpu(const AddCowHeader *le, AddCowHeader *cpu)
+{
+cpu-magic  = le64_to_cpu(le-magic);
+cpu-version= le32_to_cpu(le-version);
+
+cpu-backing_filename_offset= le32_to_cpu(le-backing_filename_offset);
+cpu-backing_filename_size  = le32_to_cpu(le-backing_filename_size);
+
+cpu-image_filename_offset  = le32_to_cpu(le-image_filename_offset);
+cpu-image_filename_size= le32_to_cpu(le-image_filename_size);
+
+cpu-cluster_bits   = le32_to_cpu(le-cluster_bits);
+cpu-features   = le64_to_cpu(le-features);
+cpu-optional_features  = le64_to_cpu(le-optional_features);
+cpu-header_pages_size  = le32_to_cpu(le-header_pages_size);
+
+memcpy(cpu-backing_fmt, le-backing_fmt, sizeof(cpu-backing_fmt));
+memcpy(cpu-image_fmt, le-image_fmt, sizeof(cpu-image_fmt));
+}
+
+static void add_cow_header_cpu_to_le(const AddCowHeader *cpu, AddCowHeader *le)
+{
+le-magic   = cpu_to_le64(cpu-magic);
+le-version = cpu_to_le32(cpu-version);
+
+le-backing_filename_offset = 
cpu_to_le32(cpu-backing_filename_offset);
+le-backing_filename_size   = cpu_to_le32(cpu-backing_filename_size);
+
+le-image_filename_offset   = cpu_to_le32(cpu-image_filename_offset);
+le-image_filename_size = cpu_to_le32(cpu-image_filename_size);
+
+le-cluster_bits= cpu_to_le32(cpu-cluster_bits);
+le-features= cpu_to_le64(cpu-features);
+le-optional_features   = cpu_to_le64(cpu-optional_features);
+le-header_pages_size   = cpu_to_le32(cpu-header_pages_size);
+memcpy(le-backing_fmt, cpu-backing_fmt, sizeof(cpu-backing_fmt));
+memcpy(le-image_fmt, cpu-image_fmt, sizeof(cpu-image_fmt));
+}
+
+static int add_cow_probe(const uint8_t *buf, int buf_size, const char 
*filename)
+{
+const AddCowHeader *header = (const AddCowHeader *)buf;
+
+if (le64_to_cpu(header-magic) == ADD_COW_MAGIC 
+le32_to_cpu(header-version) == ADD_COW_VERSION) {
+return 100;
+} else {
+return 0;
+}
+}
+
+static int add_cow_create(const char *filename, QemuOpts *opts)
+{
+AddCowHeader header = {
+.magic = ADD_COW_MAGIC,
+.version = ADD_COW_VERSION,
+.features = 0,
+.optional_features = 0,
+.header_pages_size = ADD_COW_DEFAULT_PAGE_SIZE,
+};
+AddCowHeader le_header;
+int64_t image_len = 0;
+const char *backing_filename = NULL;
+const char *backing_fmt = NULL;
+const char *image_filename = NULL;
+const char *image_format = NULL;
+BlockDriverState *bs, *image_bs = NULL, *backing_bs = NULL;
+BlockDriver *drv = bdrv_find_format(add-cow);
+BDRVAddCowState s;
+size_t cluster_size;
+int ret;
+
+image_len = qemu_opt_get_number(opts, BLOCK_OPT_SIZE, 0);
+backing_filename = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
+backing_fmt = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
+image_filename = qemu_opt_get(opts, BLOCK_OPT_IMAGE_FILE);
+image_format = qemu_opt_get(opts, BLOCK_OPT_IMAGE_FMT);
+cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
+ ADD_COW_CLUSTER_SIZE);
+
+header.cluster_bits = ffs(cluster_size) - 1;
+if (header.cluster_bits  MIN_CLUSTER_BITS ||
+header.cluster_bits  

Re: [Qemu-devel] [PATCH 3/3] chardev: add hotplug support.

2012-10-18 Thread Gerd Hoffmann
On 10/17/12 12:09, Gerd Hoffmann wrote:
 This patch adds chardev_add and chardev_remove monitor commands.
 
 They work similar to the netdev_{add,del} commands.  The hmp version of
 chardev_add accepts like the -chardev command line option does.  The qmp
 version expects the arguments being passed as named parameters.

Trying another approach, see attached patch.  This adds backend-specific
qemu commands to add chardevs, with just the parameters needed for the
specific backend.  Starting with file and tty, both accepting a path.
'file' is nice for testing, 'tty' very useful when hotplugging serial
devices on the host.  Others can be added as needed.  We probably don't
need all of them.  For example hotplugging the 'stdio' chardev doesn't
make much sense.

Advantage #1: Cleaner API.
Advantage #2: No legacy syntax headache when qomifying chardevs.

Comments?

cheers,
  Gerd
From 7f80af55530c9a448e4aed5a41c76a8e9514a27c Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann kra...@redhat.com
Date: Thu, 11 Oct 2012 14:53:00 +0200
Subject: [PATCH] chardev: add hotplug support.

This patch adds chardev_add_file, chardev_add_tty and chardev_remove
monitor commands.

chardev_add_file and chardev_add_tty expect an id and a path, they
create a file/tty chardev.

chardev_del just takes an id argument and zaps the chardev specified.

Signed-off-by: Gerd Hoffmann kra...@redhat.com
---
 hmp-commands.hx  |   44 +++
 hmp.c|   29 
 hmp.h|3 ++
 qapi-schema.json |   43 ++
 qemu-char.c  |   41 +
 qemu-char.h  |2 +
 qmp-commands.hx  |   76 ++
 7 files changed, 238 insertions(+), 0 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index e0b537d..ecfc497 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1404,6 +1404,50 @@ passed since 1970, i.e. unix epoch.
 ETEXI
 
 {
+.name   = chardev_add_file,
+.args_type  = id:s,path:s,
+.params = id path ,
+.help   = add file chardev,
+.mhandler.cmd = hmp_chardev_add_file,
+},
+
+STEXI
+@item chardev_add_file id path
+@findex chardev_add_file
+
+ETEXI
+
+{
+.name   = chardev_add_tty,
+.args_type  = id:s,path:s,
+.params = id path ,
+.help   = add tty chardev,
+.mhandler.cmd = hmp_chardev_add_tty,
+},
+
+STEXI
+@item chardev_add_tty id path
+@findex chardev_add_tty
+
+ETEXI
+
+{
+.name   = chardev_remove,
+.args_type  = id:s,
+.params = id,
+.help   = remove chardev,
+.mhandler.cmd = hmp_chardev_remove,
+},
+
+STEXI
+@item chardev_remove id
+@findex chardev_remove
+
+Removes the chardev @var{id}.
+
+ETEXI
+
+{
 .name   = info,
 .args_type  = item:s?,
 .params = [subcommand],
diff --git a/hmp.c b/hmp.c
index 70bdec2..346dd29 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1209,3 +1209,32 @@ void hmp_screen_dump(Monitor *mon, const QDict *qdict)
 qmp_screendump(filename, err);
 hmp_handle_error(mon, err);
 }
+
+static void hmp_chardev_add_path(Monitor *mon, const QDict *qdict,
+ const char *backend)
+{
+const char *id   = qdict_get_str(qdict, args);
+const char *path = qdict_get_str(qdict, path);
+Error *local_err = NULL;
+
+qmp_chardev_add_path(id, path, backend, local_err);
+hmp_handle_error(mon, local_err);
+}
+
+void hmp_chardev_add_file(Monitor *mon, const QDict *qdict)
+{
+hmp_chardev_add_path(mon, qdict, file);
+}
+
+void hmp_chardev_add_tty(Monitor *mon, const QDict *qdict)
+{
+hmp_chardev_add_path(mon, qdict, tty);
+}
+
+void hmp_chardev_remove(Monitor *mon, const QDict *qdict)
+{
+Error *local_err = NULL;
+
+qmp_chardev_remove(qdict_get_str(qdict, id), local_err);
+hmp_handle_error(mon, local_err);
+}
diff --git a/hmp.h b/hmp.h
index 71ea384..107941d 100644
--- a/hmp.h
+++ b/hmp.h
@@ -75,5 +75,8 @@ void hmp_getfd(Monitor *mon, const QDict *qdict);
 void hmp_closefd(Monitor *mon, const QDict *qdict);
 void hmp_send_key(Monitor *mon, const QDict *qdict);
 void hmp_screen_dump(Monitor *mon, const QDict *qdict);
+void hmp_chardev_add_file(Monitor *mon, const QDict *qdict);
+void hmp_chardev_add_tty(Monitor *mon, const QDict *qdict);
+void hmp_chardev_remove(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/qapi-schema.json b/qapi-schema.json
index f9dbdae..76e765b 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2796,3 +2796,46 @@
 # Since: 0.14.0
 ##
 { 'command': 'screendump', 'data': {'filename': 'str'} }
+
+##
+# @chardev-add-file:
+#
+# Add a file chardev
+#
+# @id: the chardev's ID, must be unique
+# @path: file path
+#
+# Returns: Nothing on success
+#
+# Since: 1.3.0
+##
+{ 'command': 'chardev-add-file', 'data': {'id'   : 'str',
+  'path' : 'str' 

[Qemu-devel] [PATCH V13 3/6] qed_read_string to bdrv_read_string

2012-10-18 Thread Dong Xu Wang
Make qed_read_string function to a common interface, so move it to block.c.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 block.c |   27 +++
 block.h |2 ++
 block/qed.c |   34 --
 3 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/block.c b/block.c
index 03ba485..9afb7b5 100644
--- a/block.c
+++ b/block.c
@@ -215,6 +215,33 @@ int path_has_protocol(const char *path)
 return *p == ':';
 }
 
+/**
+ * Read a string of known length from the image file
+ *
+ * @bs: Image file
+ * @offset: File offset to start of string, in bytes
+ * @n:  String length in bytes
+ * @buf:Destination buffer
+ * @buflen: Destination buffer length in bytes
+ * @ret:0 on success, -errno on failure
+ *
+ * The string is NUL-terminated.
+ */
+int bdrv_read_string(BlockDriverState *bs, uint64_t offset, size_t n,
+ char *buf, size_t buflen)
+{
+int ret;
+if (n = buflen) {
+return -EINVAL;
+}
+ret = bdrv_pread(bs, offset, buf, n);
+if (ret  0) {
+return ret;
+}
+buf[n] = '\0';
+return 0;
+}
+
 int path_is_absolute(const char *path)
 {
 #ifdef _WIN32
diff --git a/block.h b/block.h
index 364ba04..166e00c 100644
--- a/block.h
+++ b/block.h
@@ -168,6 +168,8 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
 const void *buf, int count);
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov);
+int bdrv_read_string(BlockDriverState *bs, uint64_t offset, size_t n,
+ char *buf, size_t buflen);
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
diff --git a/block/qed.c b/block/qed.c
index 0a9dbe8..096de21 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -217,33 +217,6 @@ static bool qed_is_image_size_valid(uint64_t image_size, 
uint32_t cluster_size,
 }
 
 /**
- * Read a string of known length from the image file
- *
- * @file:   Image file
- * @offset: File offset to start of string, in bytes
- * @n:  String length in bytes
- * @buf:Destination buffer
- * @buflen: Destination buffer length in bytes
- * @ret:0 on success, -errno on failure
- *
- * The string is NUL-terminated.
- */
-static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n,
-   char *buf, size_t buflen)
-{
-int ret;
-if (n = buflen) {
-return -EINVAL;
-}
-ret = bdrv_pread(file, offset, buf, n);
-if (ret  0) {
-return ret;
-}
-buf[n] = '\0';
-return 0;
-}
-
-/**
  * Allocate new clusters
  *
  * @s:  QED state
@@ -437,9 +410,10 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
 return -EINVAL;
 }
 
-ret = qed_read_string(bs-file, s-header.backing_filename_offset,
-  s-header.backing_filename_size, 
bs-backing_file,
-  sizeof(bs-backing_file));
+ret = bdrv_read_string(bs-file, s-header.backing_filename_offset,
+   s-header.backing_filename_size,
+   bs-backing_file,
+   sizeof(bs-backing_file));
 if (ret  0) {
 return ret;
 }
-- 
1.7.1




[Qemu-devel] [PATCH 0/9] Incoming migration coroutine

2012-10-18 Thread Paolo Bonzini
Juan,

here is the implementation of migration-in-a-coroutine, applying on top
of your pull request.  With these patches, the monitor (and in the future
the NBD server) is responsive during migration.

The first ten patches are just cleanups, generalizing some parts of
QEMUFile and improving the way migration sockets are closed.

The last two actually implement the feature.  They are the opposite
change of the nonblocking-blocking change that you implemented for the
migration thread.  However, the change is much simpler because we have
no timers, and because of the use of coroutines.

Without coroutines (and as in non-threaded migration), you have
to proceed in two steps: first collect data in a buffer, then
write it.  This lets you handle EAGAIN only at precise points in
buffered_flush/buffered_put_buffer, so that you can restart writing
in migrate_fd_put_notify.  This checkpointing is the reason why
QEMUFileBuffered exists.  With coroutines, you can just stop whenever
you want with qemu_coroutine_yield.  As soon as select tells you that
you can read, you'll re-enter directly in qemu_get_buffer, read more
data and pass it to the loading routines.

Thanks,

Paolo

Paolo Bonzini (12):
  migration: unify stdio-based QEMUFile operations
  migration: consolidate QEMUFile methods in a single QEMUFileOps struct
  migration: add qemu_get_fd
  migration: replace qemu_stdio_fd with qemu_get_fd
  migration: clean up server sockets and handlers before invoking 
process_incoming_migration
  migration: use migrate_fd_close in migrate_fd_cleanup
  migration: use closesocket, not close
  migration: xxx_close will only be called once
  migration: close socket QEMUFile from socket_close
  migration: move qemu_fclose to process_incoming_migration
  migration: handle EAGAIN while reading QEMUFile
  migration: move process_incoming_migration to a coroutine

 buffered_file.c  |  21 +--
 migration-exec.c |  19 +++---
 migration-fd.c   |  36 +--
 migration-tcp.c  |  19 +++---
 migration-unix.c |  17 +++--
 migration.c  |  47 +-
 qemu-file.h  |  23 ---
 savevm.c | 188 ---
 8 file modificati, 215 inserzioni(+), 155 rimozioni(-)

-- 
1.7.12.1




[Qemu-devel] [PATCH 02/12] migration: consolidate QEMUFile methods in a single QEMUFileOps struct

2012-10-18 Thread Paolo Bonzini
Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 buffered_file.c |  13 ---
 qemu-file.h |  16 +
 savevm.c| 108 +++-
 3 file modificati, 79 inserzioni(+), 58 rimozioni(-)

diff --git a/buffered_file.c b/buffered_file.c
index ed92df1..a5c0b12 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -234,6 +234,14 @@ static void buffered_rate_tick(void *opaque)
 buffered_put_buffer(s, NULL, 0, 0);
 }
 
+static const QEMUFileOps buffered_file_ops = {
+.put_buffer = buffered_put_buffer,
+.close =  buffered_close,
+.rate_limit = buffered_rate_limit,
+.get_rate_limit = buffered_get_rate_limit,
+.set_rate_limit = buffered_set_rate_limit,
+};
+
 QEMUFile *qemu_fopen_ops_buffered(MigrationState *migration_state)
 {
 QEMUFileBuffered *s;
@@ -243,10 +251,7 @@ QEMUFile *qemu_fopen_ops_buffered(MigrationState 
*migration_state)
 s-migration_state = migration_state;
 s-xfer_limit = migration_state-bandwidth_limit / 10;
 
-s-file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
- buffered_close, buffered_rate_limit,
- buffered_set_rate_limit,
-buffered_get_rate_limit);
+s-file = qemu_fopen_ops(s, buffered_file_ops);
 
 s-timer = qemu_new_timer_ms(rt_clock, buffered_rate_tick, s);
 
diff --git a/qemu-file.h b/qemu-file.h
index 9c8985b..c89e8e0 100644
--- a/qemu-file.h
+++ b/qemu-file.h
@@ -59,12 +59,16 @@ typedef int (QEMUFileRateLimit)(void *opaque);
 typedef int64_t (QEMUFileSetRateLimit)(void *opaque, int64_t new_rate);
 typedef int64_t (QEMUFileGetRateLimit)(void *opaque);
 
-QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
- QEMUFileGetBufferFunc *get_buffer,
- QEMUFileCloseFunc *close,
- QEMUFileRateLimit *rate_limit,
- QEMUFileSetRateLimit *set_rate_limit,
- QEMUFileGetRateLimit *get_rate_limit);
+typedef struct QEMUFileOps {
+QEMUFilePutBufferFunc *put_buffer;
+QEMUFileGetBufferFunc *get_buffer;
+QEMUFileCloseFunc *close;
+QEMUFileRateLimit *rate_limit;
+QEMUFileSetRateLimit *set_rate_limit;
+QEMUFileGetRateLimit *get_rate_limit;
+} QEMUFileOps;
+
+QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
 QEMUFile *qemu_fopen(const char *filename, const char *mode);
 QEMUFile *qemu_fdopen(int fd, const char *mode);
 QEMUFile *qemu_fopen_socket(int fd);
diff --git a/savevm.c b/savevm.c
index 7068390..c3ce00a 100644
--- a/savevm.c
+++ b/savevm.c
@@ -162,12 +162,7 @@ void qemu_announce_self(void)
 #define IO_BUF_SIZE 32768
 
 struct QEMUFile {
-QEMUFilePutBufferFunc *put_buffer;
-QEMUFileGetBufferFunc *get_buffer;
-QEMUFileCloseFunc *close;
-QEMUFileRateLimit *rate_limit;
-QEMUFileSetRateLimit *set_rate_limit;
-QEMUFileGetRateLimit *get_rate_limit;
+const QEMUFileOps *ops;
 void *opaque;
 int is_write;
 
@@ -256,6 +251,16 @@ static int stdio_fclose(void *opaque)
 return ret;
 }
 
+static const QEMUFileOps stdio_pipe_read_ops = {
+.get_buffer = stdio_get_buffer,
+.close =  stdio_pclose
+};
+
+static const QEMUFileOps stdio_pipe_write_ops = {
+.put_buffer = stdio_put_buffer,
+.close =  stdio_pclose
+};
+
 QEMUFile *qemu_popen(FILE *stdio_file, const char *mode)
 {
 QEMUFileStdio *s;
@@ -270,11 +275,9 @@ QEMUFile *qemu_popen(FILE *stdio_file, const char *mode)
 s-stdio_file = stdio_file;
 
 if(mode[0] == 'r') {
-s-file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_pclose, 
-NULL, NULL, NULL);
+s-file = qemu_fopen_ops(s, stdio_pipe_read_ops);
 } else {
-s-file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_pclose, 
-NULL, NULL, NULL);
+s-file = qemu_fopen_ops(s, stdio_pipe_write_ops);
 }
 return s-file;
 }
@@ -302,6 +305,16 @@ int qemu_stdio_fd(QEMUFile *f)
 return fd;
 }
 
+static const QEMUFileOps stdio_file_read_ops = {
+.get_buffer = stdio_get_buffer,
+.close =  stdio_fclose
+};
+
+static const QEMUFileOps stdio_file_write_ops = {
+.put_buffer = stdio_put_buffer,
+.close =  stdio_fclose
+};
+
 QEMUFile *qemu_fdopen(int fd, const char *mode)
 {
 QEMUFileStdio *s;
@@ -319,11 +332,9 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
 goto fail;
 
 if(mode[0] == 'r') {
-s-file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_fclose, 
-NULL, NULL, NULL);
+s-file = qemu_fopen_ops(s, stdio_file_read_ops);
 } else {
-s-file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_fclose, 
-NULL, NULL, NULL);
+s-file = qemu_fopen_ops(s, stdio_file_write_ops);
 }
 return s-file;
 
@@ -332,13 +343,17 @@ 

[Qemu-devel] [PATCH 07/12] migration: use closesocket, not close

2012-10-18 Thread Paolo Bonzini
Windows requires this.  Migration does not quite work under Windows
but let's be uniform across QEMU.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
---
 migration-tcp.c | 6 +++---
 1 file modificato, 3 inserzioni(+), 3 rimozioni(-)

diff --git a/migration-tcp.c b/migration-tcp.c
index 1bd8481..e797d23 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -45,7 +45,7 @@ static int tcp_close(MigrationState *s)
 int r = 0;
 DPRINTF(tcp_close\n);
 if (s-fd != -1) {
-if (close(s-fd)  0) {
+if (closesocket(s-fd)  0) {
 r = -errno;
 }
 s-fd = -1;
@@ -97,7 +97,7 @@ static void tcp_accept_incoming_migration(void *opaque)
 c = qemu_accept(s, (struct sockaddr *)addr, addrlen);
 } while (c == -1  socket_error() == EINTR);
 qemu_set_fd_handler2(s, NULL, NULL, NULL, NULL);
-close(s);
+closesocket(s);
 
 DPRINTF(accepted migration\n);
 
@@ -115,7 +115,7 @@ static void tcp_accept_incoming_migration(void *opaque)
 process_incoming_migration(f);
 qemu_fclose(f);
 out:
-close(c);
+closesocket(c);
 }
 
 int tcp_start_incoming_migration(const char *host_port, Error **errp)
-- 
1.7.12.1





  1   2   3   >