date:20130717

[Qemu-devel] RFC: [PATCH 0/4] xics: in-kernel support

2013-07-17 Thread Alexey Kardashevskiy


This is rework of in-kernel XICS on top of [PATCH 00/11] pseries: migration
and QOM support + compile fix patch + XICS migration fix patch.

Migration from XICS to XICS-KVM and vice versa works.

In this series, XICS-KVM inherits from XICS. I do not really see the point of
adding one more XICS-common here.

Any comments? Thanks.


Alexey Kardashevskiy (2):
  xics: add pre_save/post_load/cpu_setup dispatchers
  xics: rework initialization

David Gibson (2):
  target-ppc: Add helper for KVM_PPC_RTAS_DEFINE_TOKEN
  xics: Support for in-kernel XICS interrupt controller

 default-configs/ppc64-softmmu.mak |   1 +
 hw/intc/Makefile.objs |   1 +
 hw/intc/xics.c|  96 ++--
 hw/intc/xics_kvm.c| 469 ++
 hw/ppc/spapr.c|  30 ++-
 include/hw/ppc/xics.h |  56 -
 target-ppc/kvm.c  |  14 ++
 target-ppc/kvm_ppc.h  |   7 +
 8 files changed, 646 insertions(+), 28 deletions(-)
 create mode 100644 hw/intc/xics_kvm.c

-- 
1.8.3.2

[Qemu-devel] [PATCH 1/4] target-ppc: Add helper for KVM_PPC_RTAS_DEFINE_TOKEN

2013-07-17 Thread Alexey Kardashevskiy

From: David Gibson da...@gibson.dropbear.id.au

Recent PowerKVM allows the kernel to intercept some RTAS calls from the
guest directly.  This is used to implement the more efficient in-kernel
XICS for example.  qemu is still responsible for assigning the RTAS token
numbers however, and needs to tell the kernel which RTAS function name is
assigned to a given token value.  This patch adds a convenience wrapper for
the KVM_PPC_RTAS_DEFINE_TOKEN ioctl() which is used for this purpose.

Signed-off-by: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 target-ppc/kvm.c | 14 ++
 target-ppc/kvm_ppc.h |  7 +++
 2 files changed, 21 insertions(+)

diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 27e2aaf..606bdb9 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -1789,6 +1789,20 @@ static int kvm_ppc_register_host_cpu_type(void)
 return 0;
 }
 
+int kvmppc_define_rtas_token(uint32_t token, const char *function)
+{
+struct kvm_rtas_token_args args = {
+.token = token,
+};
+
+if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_RTAS)) {
+return -ENOENT;
+}
+
+strncpy(args.name, function, sizeof(args.name));
+
+return kvm_vm_ioctl(kvm_state, KVM_PPC_RTAS_DEFINE_TOKEN, args);
+}
 
 int kvmppc_get_htab_fd(bool write)
 {
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index 4ae7bf2..12564ef 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -38,6 +38,7 @@ uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int 
hash_shift);
 #endif /* !CONFIG_USER_ONLY */
 int kvmppc_fixup_cpu(PowerPCCPU *cpu);
 bool kvmppc_has_cap_epr(void);
+int kvmppc_define_rtas_token(uint32_t token, const char *function);
 int kvmppc_get_htab_fd(bool write);
 int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns);
 int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
@@ -164,6 +165,12 @@ static inline bool kvmppc_has_cap_epr(void)
 return false;
 }
 
+static inline int kvmppc_define_rtas_token(uint32_t token,
+   const char *function)
+{
+return -1;
+}
+
 static inline int kvmppc_get_htab_fd(bool write)
 {
 return -1;
-- 
1.8.3.2

[Qemu-devel] [PATCH 2/4] xics: add pre_save/post_load/cpu_setup dispatchers

2013-07-17 Thread Alexey Kardashevskiy

The upcoming support of in-kernel XICS will redefine migration callbacks
for both ICS and ICP so classes and callback pointers are added.

This adds a cpu_setup callback to the XICS device class (as XICS-KVM
will do it different) and xics_dispatch_cpu_setup(). This also moves
the place where xics_dispatch_cpu_setup() is called a bit further to
have VCPU initialized (required for XICS-KVM).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 hw/intc/xics.c| 61 +++
 hw/ppc/spapr.c|  4 ++--
 include/hw/ppc/xics.h | 46 +-
 3 files changed, 104 insertions(+), 7 deletions(-)

diff --git a/hw/intc/xics.c b/hw/intc/xics.c
index 6b3c071..283c2dd 100644
--- a/hw/intc/xics.c
+++ b/hw/intc/xics.c
@@ -153,11 +153,35 @@ static void icp_irq(XICSState *icp, int server, int nr, 
uint8_t priority)
 }
 }
 
+static void icp_dispatch_pre_save(void *opaque)
+{
+ICPState *ss = opaque;
+ICPStateClass *info = ICP_GET_CLASS(ss);
+
+if (info-pre_save) {
+info-pre_save(ss);
+}
+}
+
+static int icp_dispatch_post_load(void *opaque, int version_id)
+{
+ICPState *ss = opaque;
+ICPStateClass *info = ICP_GET_CLASS(ss);
+
+if (info-post_load) {
+info-post_load(ss);
+}
+
+return 0;
+}
+
 static const VMStateDescription vmstate_icp_server = {
 .name = icp/server,
 .version_id = 1,
 .minimum_version_id = 1,
 .minimum_version_id_old = 1,
+.pre_save = icp_dispatch_pre_save,
+.post_load = icp_dispatch_post_load,
 .fields  = (VMStateField []) {
 /* Sanity check */
 VMSTATE_UINT32(xirr, ICPState),
@@ -192,6 +216,7 @@ static TypeInfo icp_info = {
 .parent = TYPE_DEVICE,
 .instance_size = sizeof(ICPState),
 .class_init = icp_class_init,
+.class_size = sizeof(ICPStateClass),
 };
 
 /*
@@ -353,10 +378,9 @@ static void ics_reset(DeviceState *dev)
 }
 }
 
-static int ics_post_load(void *opaque, int version_id)
+static int ics_post_load(ICSState *ics)
 {
 int i;
-ICSState *ics = opaque;
 
 for (i = 0; i  ics-icp-nr_servers; i++) {
 icp_resend(ics-icp, i);
@@ -365,6 +389,28 @@ static int ics_post_load(void *opaque, int version_id)
 return 0;
 }
 
+static void ics_dispatch_pre_save(void *opaque)
+{
+ICSState *ics = opaque;
+ICSStateClass *info = ICS_GET_CLASS(ics);
+
+if (info-pre_save) {
+info-pre_save(ics);
+}
+}
+
+static int ics_dispatch_post_load(void *opaque, int version_id)
+{
+ICSState *ics = opaque;
+ICSStateClass *info = ICS_GET_CLASS(ics);
+
+if (info-post_load) {
+info-post_load(ics);
+}
+
+return 0;
+}
+
 static const VMStateDescription vmstate_ics_irq = {
 .name = ics/irq,
 .version_id = 1,
@@ -384,7 +430,8 @@ static const VMStateDescription vmstate_ics = {
 .version_id = 1,
 .minimum_version_id = 1,
 .minimum_version_id_old = 1,
-.post_load = ics_post_load,
+.pre_save = ics_dispatch_pre_save,
+.post_load = ics_dispatch_post_load,
 .fields  = (VMStateField []) {
 /* Sanity check */
 VMSTATE_UINT32_EQUAL(nr_irqs, ICSState),
@@ -409,10 +456,12 @@ static int ics_realize(DeviceState *dev)
 static void ics_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
+ICSStateClass *k = ICS_CLASS(klass);
 
 dc-init = ics_realize;
 dc-vmsd = vmstate_ics;
 dc-reset = ics_reset;
+k-post_load = ics_post_load;
 }
 
 static TypeInfo ics_info = {
@@ -420,6 +469,7 @@ static TypeInfo ics_info = {
 .parent = TYPE_DEVICE,
 .instance_size = sizeof(ICSState),
 .class_init = ics_class_init,
+.class_size = sizeof(ICSStateClass),
 };
 
 /*
@@ -612,7 +662,7 @@ static void xics_reset(DeviceState *d)
 device_reset(DEVICE(icp-ics));
 }
 
-void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
+static void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
 {
 CPUState *cs = CPU(cpu);
 CPUPPCState *env = cpu-env;
@@ -674,10 +724,12 @@ static Property xics_properties[] = {
 static void xics_class_init(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
+XICSStateClass *k = XICS_CLASS(oc);
 
 dc-realize = xics_realize;
 dc-props = xics_properties;
 dc-reset = xics_reset;
+k-cpu_setup = xics_cpu_setup;
 
 spapr_rtas_register(ibm,set-xive, rtas_set_xive);
 spapr_rtas_register(ibm,get-xive, rtas_get_xive);
@@ -694,6 +746,7 @@ static const TypeInfo xics_info = {
 .name  = TYPE_XICS,
 .parent= TYPE_SYS_BUS_DEVICE,
 .instance_size = sizeof(XICSState),
+.class_size = sizeof(XICSStateClass),
 .class_init= xics_class_init,
 .instance_init = xics_initfn,
 };
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 16bfab9..432f0d2 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1155,8 +1155,6 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
 }

[Qemu-devel] [PATCH 3/4] xics: rework initialization

2013-07-17 Thread Alexey Kardashevskiy

Currently RTAS and hypercalls are registered in the XICS class init
function. The upcoming XICS-KVM will inherit from XICS but will use
another API to register RTAS tokens with KVM so registration has
to move from the class init function (common for both XICS and
XICS-KVM) to the _realize function (specific to the controller).

This moves ICS creation to _realize as there is no point to create
some child devices in initfn() (ICS) and some in realize() (ICPs).

As initfn is no more needed, this removes it.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 hw/intc/xics.c | 35 +++
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/hw/intc/xics.c b/hw/intc/xics.c
index 283c2dd..a359f52 100644
--- a/hw/intc/xics.c
+++ b/hw/intc/xics.c
@@ -689,9 +689,23 @@ static void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
 static void xics_realize(DeviceState *dev, Error **errp)
 {
 XICSState *icp = XICS(dev);
-ICSState *ics = icp-ics;
+ICSState *ics;
 int i;
 
+spapr_rtas_register(ibm,set-xive, rtas_set_xive);
+spapr_rtas_register(ibm,get-xive, rtas_get_xive);
+spapr_rtas_register(ibm,int-off, rtas_int_off);
+spapr_rtas_register(ibm,int-on, rtas_int_on);
+
+spapr_register_hypercall(H_CPPR, h_cppr);
+spapr_register_hypercall(H_IPI, h_ipi);
+spapr_register_hypercall(H_XIRR, h_xirr);
+spapr_register_hypercall(H_EOI, h_eoi);
+
+icp-ics = ICS(object_new(TYPE_ICS));
+ics = icp-ics;
+object_property_add_child(OBJECT(icp), ics, OBJECT(icp-ics), NULL);
+
 ics-nr_irqs = icp-nr_irqs;
 ics-offset = XICS_IRQ_BASE;
 ics-icp = icp;
@@ -707,14 +721,6 @@ static void xics_realize(DeviceState *dev, Error **errp)
 }
 }
 
-static void xics_initfn(Object *obj)
-{
-XICSState *xics = XICS(obj);
-
-xics-ics = ICS(object_new(TYPE_ICS));
-object_property_add_child(obj, ics, OBJECT(xics-ics), NULL);
-}
-
 static Property xics_properties[] = {
 DEFINE_PROP_UINT32(nr_servers, XICSState, nr_servers, -1),
 DEFINE_PROP_UINT32(nr_irqs, XICSState, nr_irqs, -1),
@@ -730,16 +736,6 @@ static void xics_class_init(ObjectClass *oc, void *data)
 dc-props = xics_properties;
 dc-reset = xics_reset;
 k-cpu_setup = xics_cpu_setup;
-
-spapr_rtas_register(ibm,set-xive, rtas_set_xive);
-spapr_rtas_register(ibm,get-xive, rtas_get_xive);
-spapr_rtas_register(ibm,int-off, rtas_int_off);
-spapr_rtas_register(ibm,int-on, rtas_int_on);
-
-spapr_register_hypercall(H_CPPR, h_cppr);
-spapr_register_hypercall(H_IPI, h_ipi);
-spapr_register_hypercall(H_XIRR, h_xirr);
-spapr_register_hypercall(H_EOI, h_eoi);
 }
 
 static const TypeInfo xics_info = {
@@ -748,7 +744,6 @@ static const TypeInfo xics_info = {
 .instance_size = sizeof(XICSState),
 .class_size = sizeof(XICSStateClass),
 .class_init= xics_class_init,
-.instance_init = xics_initfn,
 };
 
 static void xics_register_types(void)
-- 
1.8.3.2

[Qemu-devel] [PATCH 4/4] xics: Support for in-kernel XICS interrupt controller

2013-07-17 Thread Alexey Kardashevskiy

From: David Gibson da...@gibson.dropbear.id.au

Recent (host) kernels support emulating the PAPR defined XICS interrupt
controller system within KVM.  This patch allows qemu to initialize and
configure the in-kernel XICS, and keep its state in sync with qemu's XICS
state as necessary.

This should give considerable performance improvements.  e.g. on a simple
IPI ping-pong test between hardware threads, using qemu XICS gives us
around 5,000 irqs/second, whereas the in-kernel XICS gives us around
70,000 irqs/s on the same hardware configuration.

[Mike Qiu qiud...@linux.vnet.ibm.com: fixed mistype which caused 
ics_set_kvm_state() to fail]
Signed-off-by: David Gibson da...@gibson.dropbear.id.au
[aik: moved to a separate device, reworked QOM]

---
Changes:
2013/07/17:
* QOM rework

2013/07/01
* fixed VMState names in order to support xics-kvm migration to xics and vice 
versa

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 default-configs/ppc64-softmmu.mak |   1 +
 hw/intc/Makefile.objs |   1 +
 hw/intc/xics_kvm.c| 469 ++
 hw/ppc/spapr.c|  26 ++-
 include/hw/ppc/xics.h |  10 +
 5 files changed, 506 insertions(+), 1 deletion(-)
 create mode 100644 hw/intc/xics_kvm.c

diff --git a/default-configs/ppc64-softmmu.mak 
b/default-configs/ppc64-softmmu.mak
index 6d1933b..1f4550a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -47,5 +47,6 @@ CONFIG_E500=y
 CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM))
 # For pSeries
 CONFIG_XICS=$(CONFIG_PSERIES)
+CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
 # For PReP
 CONFIG_MC146818RTC=y
diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs
index 2851eed..47ac442 100644
--- a/hw/intc/Makefile.objs
+++ b/hw/intc/Makefile.objs
@@ -23,3 +23,4 @@ obj-$(CONFIG_OMAP) += omap_intc.o
 obj-$(CONFIG_OPENPIC_KVM) += openpic_kvm.o
 obj-$(CONFIG_SH4) += sh_intc.o
 obj-$(CONFIG_XICS) += xics.o
+obj-$(CONFIG_XICS_KVM) += xics_kvm.o
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
new file mode 100644
index 000..04ce1be
--- /dev/null
+++ b/hw/intc/xics_kvm.c
@@ -0,0 +1,469 @@
+/*
+ * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
+ *
+ * PAPR Virtualized Interrupt System, aka ICS/ICP aka xics, in-kernel emulation
+ *
+ * Copyright (c) 2013 David Gibson, IBM Corporation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ */
+
+#include hw/hw.h
+#include trace.h
+#include hw/ppc/spapr.h
+#include hw/ppc/xics.h
+#include kvm_ppc.h
+#include qemu/config-file.h
+
+#include sys/ioctl.h
+
+typedef struct XICSStateKVM {
+struct XICSState parent;
+
+uint32_t set_xive_token;
+uint32_t get_xive_token;
+uint32_t int_off_token;
+uint32_t int_on_token;
+int kernel_xics_fd;
+} XICSStateKVM;
+
+/*
+ * ICP-KVM
+ */
+static void icp_get_kvm_state(ICPState *ss)
+{
+uint64_t state;
+struct kvm_one_reg reg = {
+.id = KVM_REG_PPC_ICP_STATE,
+.addr = (uintptr_t)state,
+};
+int ret;
+
+if (!ss-cs) {
+return; /* kernel irqchip not in use */
+}
+
+ret = kvm_vcpu_ioctl(ss-cs, KVM_GET_ONE_REG, reg);
+if (ret != 0) {
+fprintf(stderr, Unable to retrieve KVM interrupt controller state
+ for CPU %d: %s\n, ss-cs-cpu_index, strerror(errno));
+exit(1);
+}
+
+ss-xirr = state  KVM_REG_PPC_ICP_XISR_SHIFT;
+ss-mfrr = (state  KVM_REG_PPC_ICP_MFRR_SHIFT)
+ KVM_REG_PPC_ICP_MFRR_MASK;
+ss-pending_priority = (state  KVM_REG_PPC_ICP_PPRI_SHIFT)
+ KVM_REG_PPC_ICP_PPRI_MASK;
+}
+
+static int icp_set_kvm_state(ICPState *ss)
+{
+uint64_t state;
+struct kvm_one_reg reg = {
+.id = KVM_REG_PPC_ICP_STATE,
+.addr = (uintptr_t)state,
+};
+int ret;
+
+if (!ss-cs) {
+return 0; /* kernel irqchip not in use */
+}
+
+state =

Re: [Qemu-devel] [PATCH v2] gtk: Fix accelerator filtering

2013-07-17 Thread Jan Kiszka

On 2013-05-08 00:42, Jan Kiszka wrote:
 On 2013-05-07 23:03, Jordan Justen wrote:
 On Sun, Mar 24, 2013 at 11:06 AM, Jan Kiszka jan.kis...@web.de wrote:
 On 2013-02-25 16:44, Jan Kiszka wrote:
 On 2013-02-25 16:39, Anthony Liguori wrote:
 Jan Kiszka jan.kis...@siemens.com writes:

 This is in fact very simply: When the input in grabbed, everything
 should be exclusively passed to the guest - except it has our magic
 CTRL-ALT modifier set. Then let GTK filter out those accels that are in
 use. When checking the modifier state, we just need to filter out NUM
 and CAPS lock.

 Can you explain what you're fixing?

 That it's not filtering what it is supposed to.


 We shouldn't hard code modifiers like this.  The reason you give
 accelerators paths like this is so that they can be overridden by a
 user.

 That's why I filtered by path.  Once we're running, we shouldn't assume
 that accelerators use the modifiers we started with.

 Your path-based filtering does not work as it uses an unsupported
 internal function (see my other mail).

 We can make the modifier configurable via QEMU means (command line
 parameter, gconfig, whatever). But let's get the basics working first.

 The bug still exists, my patch still applies. Unless you have some idea
 for a better solution, please apply this for now so that CTRL-q inside a
 guest doesn't kill more kittens.

 I finally built qemu with gtk support, and in general it seems like a
 great improvement over SDL.

 ...except ctrl-q to quit the VM. Why is binding a hotkey to quit a
 good idea at all? It seems kind of like attaching your computer's
 power to a wall-switch. :)
 
 Yeah, this bug should really be fixed in some way before 1.5 is released
 because GTK will be default. Anthony, what is the status of your
 experiments with alternative solutions?

Re-ping on this, now for 1.6 (with the option to backport the fix).

What defines the modifiers to be used for the accelerators? Is there an
interface to query them (to avoid my original hard-coding)?

Jan



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH 05/11] spapr-tce: make sPAPRTCETable a proper device

2013-07-17 Thread Alexey Kardashevskiy

On 07/16/2013 01:11 AM, Anthony Liguori wrote:
 Model TCE tables as a device that's hooked up as a child object to
 the owner.  Besides the code cleanup, we get a few nice benefits:
 
 1) free actually works now (it was dead code before)
 
 2) the TCE information is visible in the device tree
 
 3) we can expose table information as properties such that if we
change the window_size, we can use globals to keep migration
working.
 
 Signed-off-by: David Gibson da...@gibson.dropbear.id.au
 [dwg: pseries: savevm support for PAPR TCE tables]
 Signed-off-by: Anthony Liguori aligu...@us.ibm.com
 ---
  hw/ppc/spapr.c |   3 -
  hw/ppc/spapr_iommu.c   | 145 
 -
  hw/ppc/spapr_pci.c |   2 +-
  hw/ppc/spapr_vio.c |   2 +-
  include/hw/ppc/spapr.h |  23 +---
  5 files changed, 114 insertions(+), 61 deletions(-)
 
 diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
 index 48ae092..e340708 100644
 --- a/hw/ppc/spapr.c
 +++ b/hw/ppc/spapr.c
 @@ -848,9 +848,6 @@ static void ppc_spapr_init(QEMUMachineInitArgs *args)
  /* Set up EPOW events infrastructure */
  spapr_events_init(spapr);
  
 -/* Set up IOMMU */
 -spapr_iommu_init();
 -
  /* Set up VIO bus */
  spapr-vio_bus = spapr_vio_bus_init();
  
 diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
 index 89b33a5..709cc34 100644
 --- a/hw/ppc/spapr_iommu.c
 +++ b/hw/ppc/spapr_iommu.c
 @@ -36,17 +36,6 @@ enum sPAPRTCEAccess {
  SPAPR_TCE_RW = 3,
  };
  
 -struct sPAPRTCETable {
 -uint32_t liobn;
 -uint32_t window_size;
 -sPAPRTCE *table;
 -bool bypass;
 -int fd;
 -MemoryRegion iommu;
 -QLIST_ENTRY(sPAPRTCETable) list;
 -};
 -
 -
  QLIST_HEAD(spapr_tce_tables, sPAPRTCETable) spapr_tce_tables;
  
  static sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn)
 @@ -96,7 +85,7 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion 
 *iommu, hwaddr addr)
  return (IOMMUTLBEntry) { .perm = IOMMU_NONE };
  }
  
 -tce = tcet-table[addr  SPAPR_TCE_PAGE_SHIFT].tce;
 +tce = tcet-table[addr  SPAPR_TCE_PAGE_SHIFT];
  
  #ifdef DEBUG_TCE
  fprintf(stderr,  -  *paddr=0x%llx, *len=0x%llx\n,
 @@ -112,37 +101,51 @@ static IOMMUTLBEntry 
 spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr)
  };
  }
  
 +static int spapr_tce_table_pre_load(void *opaque)
 +{
 +sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
 +
 +tcet-nb_table = tcet-window_size  SPAPR_TCE_PAGE_SHIFT;
 +
 +return 0;
 +}
 +
 +static const VMStateDescription vmstate_spapr_tce_table = {
 +.name = spapr_iommu,
 +.version_id = 1,
 +.minimum_version_id = 1,
 +.minimum_version_id_old = 1,
 +.pre_load = spapr_tce_table_pre_load,
 +.fields  = (VMStateField []) {
 +/* Sanity check */
 +VMSTATE_UINT32_EQUAL(liobn, sPAPRTCETable),
 +VMSTATE_UINT32_EQUAL(window_size, sPAPRTCETable),
 +
 +/* IOMMU state */
 +VMSTATE_BOOL(bypass, sPAPRTCETable),
 +VMSTATE_VARRAY_UINT32(table, sPAPRTCETable, nb_table, 0, 
 vmstate_info_uint64, uint64_t),
 +
 +VMSTATE_END_OF_LIST()
 +},
 +};
 +
  static MemoryRegionIOMMUOps spapr_iommu_ops = {
  .translate = spapr_tce_translate_iommu,
  };
  
 -sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, 
 size_t window_size)
 +static int spapr_tce_table_realize(DeviceState *dev)
  {
 -sPAPRTCETable *tcet;
 -
 -if (spapr_tce_find_by_liobn(liobn)) {
 -fprintf(stderr, Attempted to create TCE table with duplicate
 - LIOBN 0x%x\n, liobn);
 -return NULL;
 -}
 -
 -if (!window_size) {
 -return NULL;
 -}
 -
 -tcet = g_malloc0(sizeof(*tcet));
 -tcet-liobn = liobn;
 -tcet-window_size = window_size;
 +sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev);
  
  if (kvm_enabled()) {
 -tcet-table = kvmppc_create_spapr_tce(liobn,
 -  window_size,
 +tcet-table = kvmppc_create_spapr_tce(tcet-liobn,
 +  tcet-window_size,
tcet-fd);
  }
  
  if (!tcet-table) {
 -size_t table_size = (window_size  SPAPR_TCE_PAGE_SHIFT)
 -* sizeof(sPAPRTCE);
 +size_t table_size = (tcet-window_size  SPAPR_TCE_PAGE_SHIFT)
 +* sizeof(uint64_t);
  tcet-table = g_malloc0(table_size);
  }
  
 @@ -151,16 +154,43 @@ sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, 
 uint32_t liobn, size_t wi
  table @ %p, fd=%d\n, tcet, liobn, tcet-table, tcet-fd);
  #endif
  
 -memory_region_init_iommu(tcet-iommu, OBJECT(owner), spapr_iommu_ops,
 +memory_region_init_iommu(tcet-iommu, OBJECT(dev), spapr_iommu_ops,
   iommu-spapr, UINT64_MAX);
  
  QLIST_INSERT_HEAD(spapr_tce_tables, tcet, list);
  
 +return 0;
 +}
 +
 +sPAPRTCETable

[Qemu-devel] [PATCH] block: fix vvfat s-qcow leak on error

2013-07-17 Thread Fam Zheng

s-qcow is allocated but not freed if bdrv_open fails. Fix the possible
leak, remove unnecessary check for bdrv_new(), honor error code of
bdrv_create().

Signed-off-by: Fam Zheng f...@redhat.com
---
 block/vvfat.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 87b0279..733f382 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2927,18 +2927,18 @@ static int enable_write_target(BDRVVVFATState *s)
 set_option_parameter_int(options, BLOCK_OPT_SIZE, s-sector_count * 512);
 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, fat:);
 
-if (bdrv_create(bdrv_qcow, s-qcow_filename, options)  0)
-   return -1;
+ret = bdrv_create(bdrv_qcow, s-qcow_filename, options);
+if (ret  0) {
+return ret;
+}
 
 s-qcow = bdrv_new();
-if (s-qcow == NULL) {
-return -1;
-}
 
 ret = bdrv_open(s-qcow, s-qcow_filename, NULL,
 BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
 if (ret  0) {
-   return ret;
+bdrv_delete(s-qcow);
+return ret;
 }
 
 #ifndef _WIN32
-- 
1.8.3.2

Re: [Qemu-devel] [PATCH] [RFC] aio/async: Add timed bottom-halves

2013-07-17 Thread Kevin Wolf

Am 16.07.2013 um 18:55 hat Paolo Bonzini geschrieben:
  But I wouldn't introduce a
  new one-off concept (almost as much of a hack as idle BHs), I would
  rather reuse as much code as possible from QEMUTimer/QEMUClock.  I must
  admit I don't have a clear idea of how the API would look like.
  
  So the reason I was trying to avoid using QEMUTimer stuff was that
  bh's get called from aio_poll and it was not evident that all timers
  would be safe to call from aio_poll.
 
 It wouldn't.
 
  What do you think? In the end I thought the schedule_bh_at stuff
  was simpler.
 
 It is simpler, but I'm not sure it is the right API.  Of course, if
 Kevin and Stefan says it is, I have no problem with that.

Well, the one thing I'm pretty sure we need is an additional interface,
so that existing timers continue to run only from the main loop, and new
timers / timed BHs / whatever have the option to run in nested event
loops as well.

I don't really care what the thing is called as long as it does what is
needed. Timed BHs seemed to do that, so I agreed.

Kevin

Re: [Qemu-devel] [PATCH] [RFC] aio/async: Add timed bottom-halves

2013-07-17 Thread Alex Bligh


Stefan,

--On 17 July 2013 11:02:30 +0800 Stefan Hajnoczi stefa...@gmail.com wrote:


The steps to achieving this:

1. Drop alarm timers from qemu-timer.c and calculate g_poll() timeout
   instead for the main loop.

2. Introduce a per-AioContext aio_ctx_clock that can be used with
   qemu_new_timer() to create a QEMUTimer that expires during
   aio_poll().

3. Calculate g_poll() timeout for aio_ctx_clock in aio_poll().


A couple of questions:

1. How would this work where the user has no main loop, e.g. qemu-img? A
  block driver may well still need timers.

2. Are we guaranteed that no aio_poll user can repeatedly call aio_poll
  without exiting to the main loop?

3. Is it safe to anything you can do in a bh in a timer? IE are users every
  going to need to schedule a bh from a timer? If so, this seems a bit
  long winded for users that want bh functionality.

The timed bh solution (which I'm by no means set on) can use any QEMUClock
in the bh, so you get to choose the clock per BH, not per AioContext,
which may or may not have advantages.

--
Alex Bligh

Re: [Qemu-devel] [PATCH] fix guest physical bits to match host, to go beyond 1TB guests

2013-07-17 Thread Paolo Bonzini

Il 16/07/2013 21:42, Eduardo Habkost ha scritto:
 On Tue, Jul 16, 2013 at 09:24:30PM +0200, Paolo Bonzini wrote:
 Il 16/07/2013 20:11, Eduardo Habkost ha scritto:
 For physical bit size, what about extending it in a backwards-compatible
 way? Something like this:

 *eax = 0x0003000; /* 48 bits virtual */
 if (ram_size  1TB) {
 physical_size = 40; /* Keeping backwards compatibility */
 } else if (ram_size  4TB) {
 physical_size = 42;

 Why not go straight up to 44?
 
 I simply trusted the comment saying: The physical address space is
 limited to 42 bits in exec.c, and assumed we had a 42-bit limit
 somewhere else.

Yeah, that's obsolete.  We now can go up to 64 (but actually only
support 52 because that's what Intel says will be the limit4PB RAM
should be good for everyone, as Bill Gates used to say).

So far Intel has been upgrading the physical RAM size in x16 steps
(MAXPHYADDR was 36, then 40, then 44).  MAXPHYADDR is how Intel calls
what you wrote as physical_size.

 if (ram_size  1TB) {
 physical_size = 40; /* Keeping backwards compatibility */
 } else {
 physical_size = msb(ram_size);
 }
 if (supported_host_physical_size()  physical_size) {
 abort();
 }

Not enough because there are processors with 36.  So perhaps, putting
together both of your ideas:

 if (supported_host_physical_size()  msb(ram_size)) {
 abort();
 }
 if (ram_size  64GB  !some_compat_prop) {
 physical_size = 36;
 } else if (ram_size  1TB) {
 physical_size = 40;
 } else {
 physical_size = 44;
 }

What do you think?

 This makes sense too.  Though the best would be of course to use CPUID
 values coming from the real processors, and only using 40 for backwards
 compatibility.
 
 We can't use the values coming from the real processors directly, or
 we will break live migration.

I said real processors, not host processors. :)

So a Core 2 has MAXPHYADDR=36, Nehalem has IIRC 40, Sandy Bridge has 44,
and so on.

 If we sent those CPUID bits as part of the migration stream, it would
 make it a little safer, but then it would be impossible for libvirt to
 tell if it is really possible to migrate from one host to another.

The libvirt problem still remains, since libvirt currently doesn't know
the MAXPHYADDRs and would have to learn them.

I guess the above artificial computation of MAXPHYADDR is enough.

Paolo

[Qemu-devel] [PATCH 1/2][RESENT] Add GDB qAttached support

2013-07-17 Thread Jan Kiszka

With this patch QEMU handles qAttached request from gdb. When QEMU
replies 1, GDB sends a detach command at the end of a debugging
session otherwise GDB sends kill.

The default value for qAttached is 1 on system emulation and 0 on user
emulation.

Based on original version by Fabien Chouteau.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---
 gdbstub.c |   10 ++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/gdbstub.c b/gdbstub.c
index 0ee82a9..bc626f5 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -42,6 +42,12 @@
 #include sysemu/kvm.h
 #include qemu/bitops.h
 
+#ifdef CONFIG_USER_ONLY
+#define GDB_ATTACHED 0
+#else
+#define GDB_ATTACHED 1
+#endif
+
 #ifndef TARGET_CPU_MEMORY_RW_DEBUG
 static inline int target_memory_rw_debug(CPUArchState *env, target_ulong addr,
  uint8_t *buf, int len, int is_write)
@@ -2504,6 +2510,10 @@ static int gdb_handle_packet(GDBState *s, const char 
*line_buf)
 break;
 }
 #endif
+if (strncmp(p, Attached, 8) == 0) {
+put_packet(s, GDB_ATTACHED);
+break;
+}
 /* Unrecognised 'q' command.  */
 goto unknown_command;
 
-- 
1.7.3.4

[Qemu-devel] [PATCH 2/2][RESENT] Revert gdbstub: Do not kill target in system emulation mode

2013-07-17 Thread Jan Kiszka

The requirements described in this patch are implemented by Add GDB
qAttached support.

This reverts commit 00e94dbc7fd0110b0555d59592b004333adfb4b8.

Signed-off-by: Fabien Chouteau chout...@adacore.com
---
 gdbstub.c |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/gdbstub.c b/gdbstub.c
index bc626f5..8c632ab 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2198,11 +2198,9 @@ static int gdb_handle_packet(GDBState *s, const char 
*line_buf)
 goto unknown_command;
 }
 case 'k':
-#ifdef CONFIG_USER_ONLY
 /* Kill the target */
 fprintf(stderr, \nQEMU: Terminated via GDBstub\n);
 exit(0);
-#endif
 case 'D':
 /* Detach packet */
 gdb_breakpoint_remove_all();
-- 
1.7.3.4

Re: [Qemu-devel] [PATCH] block: fix vvfat s-qcow leak on error

2013-07-17 Thread Laszlo Ersek

On 07/17/13 09:12, Fam Zheng wrote:
 s-qcow is allocated but not freed if bdrv_open fails. Fix the possible
 leak, remove unnecessary check for bdrv_new(), honor error code of
 bdrv_create().
 
 Signed-off-by: Fam Zheng f...@redhat.com
 ---
  block/vvfat.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)
 
 diff --git a/block/vvfat.c b/block/vvfat.c
 index 87b0279..733f382 100644
 --- a/block/vvfat.c
 +++ b/block/vvfat.c
 @@ -2927,18 +2927,18 @@ static int enable_write_target(BDRVVVFATState *s)
  set_option_parameter_int(options, BLOCK_OPT_SIZE, s-sector_count * 512);
  set_option_parameter(options, BLOCK_OPT_BACKING_FILE, fat:);
  
 -if (bdrv_create(bdrv_qcow, s-qcow_filename, options)  0)
 - return -1;
 +ret = bdrv_create(bdrv_qcow, s-qcow_filename, options);
 +if (ret  0) {
 +return ret;
 +}

(1) This still seems to leak s-qcow_filename. Maybe that's not an
actual leak (the reference is not lost), but the error handler just a
little bit higher up frees s-qcow_filename and sets it to NULL if
get_tmp_filename() fails.

If this remark is justified then it could apply to the other error branches.

(2) vvfat_open() calls enable_write_target() but doesn't propagate its
retval, any error results in -EIO. Would that be worth fixing as well?

  
  s-qcow = bdrv_new();
 -if (s-qcow == NULL) {
 -return -1;
 -}
  
  ret = bdrv_open(s-qcow, s-qcow_filename, NULL,
  BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
  if (ret  0) {
 - return ret;
 +bdrv_delete(s-qcow);
 +return ret;
  }
  
  #ifndef _WIN32
 

Thanks
Laszlo

Re: [Qemu-devel] [PATCH] [RFC] aio/async: Add timed bottom-halves

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 10:07, Alex Bligh ha scritto:
 Stefan,
 
 --On 17 July 2013 11:02:30 +0800 Stefan Hajnoczi stefa...@gmail.com
 wrote:
 
 The steps to achieving this:

 1. Drop alarm timers from qemu-timer.c and calculate g_poll() timeout
instead for the main loop.

 2. Introduce a per-AioContext aio_ctx_clock that can be used with
qemu_new_timer() to create a QEMUTimer that expires during
aio_poll().

 3. Calculate g_poll() timeout for aio_ctx_clock in aio_poll().
 
 A couple of questions:
 
 1. How would this work where the user has no main loop, e.g. qemu-img? A
   block driver may well still need timers.

The block driver should only use aio_ctx_clock, and those _would_ be
handled in aio_poll().

 3. Is it safe to anything you can do in a bh in a timer? IE are users every
   going to need to schedule a bh from a timer? If so, this seems a bit
   long winded for users that want bh functionality.

It is safe.

Paolo

Re: [Qemu-devel] [RFC V8 01/24] qcow2: Add journal specification.

2013-07-17 Thread Kevin Wolf

Am 17.07.2013 um 00:45 hat Benoît Canet geschrieben:
   Simple is good.  Even for deduplication alone, I think data integrity is
   critical - otherwise we risk stale dedup metadata pointing to clusters
   that are unallocated or do not contain the right data.  So the journal
   will probably need to follow techniques for commits/checksums.
 
 
 I'll add checksums to the journal and clean the journal entry size mess soon.
 
 For the transactional/commits aspect of the journal I think that we need 
 Kevin's
 point of view on the subject.

Sorry, I was going to prepare a patch that does journalling for the
existing metadata, but once again other things stole my time. I'm still
planning to do that, though, and then we can compare whether your
requirements are fulfilled with it as well.

Kevin

Re: [Qemu-devel] [Qemu-ppc] [PATCH 2/2] Add Enhanced Three-Speed Ethernet Controller (eTSEC)

2013-07-17 Thread Fabien Chouteau

On 07/16/2013 06:54 PM, Scott Wood wrote:
 On 07/16/2013 11:15:51 AM, Fabien Chouteau wrote:
 On 07/16/2013 05:37 PM, Alexander Graf wrote:
  On 07/16/2013 05:28 PM, Fabien Chouteau wrote:
  On 07/16/2013 04:06 AM, Scott Wood wrote:
  On 07/10/2013 12:10:02 PM, Fabien Chouteau wrote:
  This implementation doesn't include ring priority, TCP/IP Off-Load, QoS.
 
  Signed-off-by: Fabien Chouteauchout...@adacore.com
   From the code comments I gather this has been tested on VxWorks.  Has it
  been tested on Linux, or anywhere else?
 
  You're right, as I said in the cover letter, this has only been tested on 
  vxWorks.
 
  Could you please give it a try? IIRC eTSEC support should be in upstream 
  Linux.
 

 I don't have time for that. As I said in the cover letter, I submit this
 patch for those interested in eTSEC, but I won't be able to test/fix it
 for Linux.
 
 Could you please at least document more fully the known limitations, such as 
 I'm only interested in 32bits address spaces?


I will, but this device is very complex and I don't even know all the
limitation of my implementation ;)

  +/* ring_base = (etsec-regs[RBASEH].value  0xF)  32; */
  +ring_base += etsec-regs[RBASE0 + ring_nbr].value  ~0x7;
  +start_bd_addr  = bd_addr = etsec-regs[RBPTR0 + ring_nbr].value  
  ~0x7;
  What about RBDBPH (upper bits of physical address)?  Likewise for TX.
 
  I'm only interested in 32bits address spaces, so RBASEH, TBASEH, RBDBPH 
  or TBDBPH.
 
  Why? I thought e500mc and above can access more than 32bits of physical 
  address space?

 Yes but this is not emulated by QEMU, right? sizeof (hwaddr) for
 qemu-system-ppc is 8...
 
 36bit physical is emulated by QEMU.  Currently we put CCSR in a place that 
 would make it difficult to use memory above 4G, but that should change at 
 some point.


But hwaddr is 32 bits, how could you call cpu_physical_memory_read()? to
a 36bits address?

Regards,

-- 
Fabien Chouteau

Re: [Qemu-devel] [Qemu-ppc] [PATCH 2/2] Add Enhanced Three-Speed Ethernet Controller (eTSEC)

2013-07-17 Thread Alexander Graf



Am 17.07.2013 um 10:24 schrieb Fabien Chouteau chout...@adacore.com:

 On 07/16/2013 06:54 PM, Scott Wood wrote:
 On 07/16/2013 11:15:51 AM, Fabien Chouteau wrote:
 On 07/16/2013 05:37 PM, Alexander Graf wrote:
 On 07/16/2013 05:28 PM, Fabien Chouteau wrote:
 On 07/16/2013 04:06 AM, Scott Wood wrote:
 On 07/10/2013 12:10:02 PM, Fabien Chouteau wrote:
 This implementation doesn't include ring priority, TCP/IP Off-Load, QoS.
 
 Signed-off-by: Fabien Chouteauchout...@adacore.com
 From the code comments I gather this has been tested on VxWorks.  Has it
 been tested on Linux, or anywhere else?
 You're right, as I said in the cover letter, this has only been tested on 
 vxWorks.
 
 Could you please give it a try? IIRC eTSEC support should be in upstream 
 Linux.
 
 I don't have time for that. As I said in the cover letter, I submit this
 patch for those interested in eTSEC, but I won't be able to test/fix it
 for Linux.
 
 Could you please at least document more fully the known limitations, such as 
 I'm only interested in 32bits address spaces?
 
 I will, but this device is very complex and I don't even know all the
 limitation of my implementation ;)
 
 +/* ring_base = (etsec-regs[RBASEH].value  0xF)  32; */
 +ring_base += etsec-regs[RBASE0 + ring_nbr].value  ~0x7;
 +start_bd_addr  = bd_addr = etsec-regs[RBPTR0 + ring_nbr].value  
 ~0x7;
 What about RBDBPH (upper bits of physical address)?  Likewise for TX.
 I'm only interested in 32bits address spaces, so RBASEH, TBASEH, RBDBPH 
 or TBDBPH.
 
 Why? I thought e500mc and above can access more than 32bits of physical 
 address space?
 
 Yes but this is not emulated by QEMU, right? sizeof (hwaddr) for
 qemu-system-ppc is 8...
 
 36bit physical is emulated by QEMU.  Currently we put CCSR in a place that 
 would make it difficult to use memory above 4G, but that should change at 
 some point.
 
 
 But hwaddr is 32 bits, how could you call cpu_physical_memory_read()? to
 a 36bits address?

8 x 8 = 64, no? :)

Alex

 
 Regards,
 
 -- 
 Fabien Chouteau

Re: [Qemu-devel] [Qemu-trivial] [PATCH] slirp: reorder include to fix FreeBSD build failure

2013-07-17 Thread Jan Kiszka

On 2013-07-14 00:35, Ed Maste wrote:
 On 13 July 2013 05:12, Michael Tokarev 
 m...@tls.msk.rumailto:m...@tls.msk.ru wrote:
 Remaining:
 
 struct mbuf {
 union M_dat {
 charm_dat_[1]; /* ANSI don't like 0 sized arrays */
 char*m_ext_;
 } M_dat;
 };
 
 #define m_dat   M_dat.m_dat_
 #define m_ext   M_dat.m_ext_
 
 This can be done by using an unnamed union, ie, by omitting
 
 Yeah, struct mbuf and those #defines date back to the beginning of BSD 
 networking.
 
 I think we're probably unconcerned with a slirp upstream at this point, so 
 such a change seems reasonable. 

Indeed, they are always welcome. There is no point in tracking upstream
anymore.

 I'm not sure that anonymous union support is universal across all compilers 
 used to build QEMU though - do you know?

No problem, as Peter already said. Please provide an according patch.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SES-DE
Corporate Competence Center Embedded Linux

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Kevin Wolf

Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
 if a destination has has_zero_init = 0, but it supports
 discard zeroes use discard to convert the target
 into an all zero device.
 
 Signed-off-by: Peter Lieven p...@kamp.de

Wouldn't it be better to use bdrv_write_zeroes() and extend the
implementation of that to use discard internally in those block drivers
where it makes sense?

Because here you're not really discarding (i.e. don't care about the
sectors any more), but you want them to be zeroed.

Kevin

Re: [Qemu-devel] [Qemu-trivial] [PATCH] slirp: reorder include to fix FreeBSD build failure

2013-07-17 Thread Michael Tokarev

17.07.2013 12:32, Jan Kiszka wrote:
 No problem, as Peter already said. Please provide an according patch.

http://thread.gmane.org/gmane.comp.emulators.qemu/221949/focus=221975
 (an attachtment there)
http://git.corpit.ru/?p=qemu.git;a=commitdiff;h=2915c3b20260b1a653fced3b584d0c2b012880dc

Thanks,

/mjt

Re: [Qemu-devel] [PATCH 14/17] PPC: dbdma: Support unaligned DMA access

2013-07-17 Thread Kevin Wolf

Am 01.07.2013 um 02:13 hat Alexander Graf geschrieben:
 The DBDMA engine really just reads bytes from a producing device (IDE
 in our case) and shoves these bytes into memory. It doesn't care whether
 any alignment takes place or not.
 
 Our code today however assumes that block accesses always happen on
 sector (512 byte) boundaries. This is a fair assumption for most cases.
 
 However, Mac OS X really likes to do unaligned, incomplete accesses
 that it finishes with the next DMA request.
 
 So we need to read / write the unaligned bits independent of the actual
 asynchronous request, because that one can only handle 512-byte-aligned
 data. We also need to cache these unaligned sectors until the next DMA
 request, at which point the data might be successfully flushed from the
 pipe.
 
 Signed-off-by: Alexander Graf ag...@suse.de

 @@ -98,14 +122,40 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int 
 ret)
  
  /* launch next transfer */
  
 -MACIO_DPRINTF(io-len = %#x\n, io-len);
 +/* handle unaligned accesses first, get them over with and only do the
 +   remaining bulk transfer using our async DMA helpers */
 +unaligned = io-len  0x1ff;
 +if (unaligned) {
 +int sector_num = (s-lba  2) + (s-io_buffer_index  9);
 +int nsector = io-len  9;
  
 -s-io_buffer_size = io-len;
 +MACIO_DPRINTF(precopying unaligned %d bytes to %#lx\n,
 +  unaligned, io-addr + io-len - unaligned);
 +
 +bdrv_read(s-bs, sector_num + nsector, io-remainder, 1);

I haven't really reviewed the general logic, but not using the return
value of bdrv_read() and bdrv_write() is most definitely wrong, I/O can
fail. More instances of the same bug follow.

Kevin

Re: [Qemu-devel] [PATCH v2 00/13] dataplane: use block layer

2013-07-17 Thread Stefan Hajnoczi

On Mon, Jul 15, 2013 at 07:05:50PM +0200, Paolo Bonzini wrote:
 Il 15/07/2013 16:42, Stefan Hajnoczi ha scritto:
  v2:
   * Rebased onto qemu.git/master
   * Added comment explaining how the dataplane thread is restarted after 
  draining [pbonzini]
  
  This series adds image format, QMP 'transation', and QMP 'block_resize' 
  support
  to dataplane.  This is done by replacing custom Linux AIO code with QEMU 
  block
  layer APIs.
  
  In order for block layer APIs to be safe, bdrv_drain_all() is modified to be
  aware of device emulation threads (like dataplane).
  BlockDevOps-drain_threads_cb() is introduced as a way to stop device 
  emulation
  threads temporarily.  Device emulation threads may resume after this main 
  loop
  iteration completes, which is enough to allow code running under the QEMU
  global mutex a chance to use the block device temporarily.
  
  bdrv_get_aio_context() is dropped in favor of a thread-local AioContext
  pointer.  This allows qemu_bh_new(), qemu_aio_wait(), and friends to
  automatically use the right AioContext.  When we introduce a 
  BlockDriverState
  lock in the future, it will be possible to use block devices from multiple
  threads arbitrarily (right now we only allow it if bdrv_drain_all() is 
  used).
  
  Three open issues:
  
   * Timers only work in the main loop, so I/O throttling is ignored and QED 
  is
 unsafe with x-data-plane=on.  I am tackling this in a separate series 
  that
 makes QEMUTimer thread-safe and can then re-enable I/O throttling.
  
   * Peformance analysis in progress.
  
   * Need to test with NBD, Gluster, and other non-file protocols.
 
 Since this is for 1.6, I am still not sure this is the right approach
 (especially a block device lock scares me a bit...).

I agree that the block device lock looks difficult.  When I started in
that direction it quickly became clear that it requires a lot of effort
to solve.  It is hard because of BDS relationships and the fact that I/O
requests live across main loop iterations (they must be reentered,
either as a callback or a coroutine).

But that approach is not implemented in this patch series...

 Quoting a private mail from Kevin (I guess he doesn't mind), the
 possible approaches include:
 
 * Stopping the dataplane event loop while another user accesses the
 BlockDriverState
 
 * Make other users use Bottom Halves to do a context switch into the
 dataplane event loop and do their operation from there
 
 * Add locks the BlockDriverState and just do the accesses from different
 threads
 
 and I believe a mix of these would be the way to go.

 Stopping the dataplane event loop is not really necessary for
 correctness; that only requires stopping the ioeventfd, and indeed this
 series already includes patches for this.  So you definitely have my
 approval (in quotes because you're the maintainer...) for patches 1-2-3.

The TLS AioContext approach is compatible with the future improvements
that you mentioned.  Remember that BlockDevOps-drain_threads_cb() is
orthogonal to TLS AioContext.  We can still get at the AioContext for a
BDS, if necessary.

The point of TLS AioContext is to avoid explicitly passing AioContext
everywhere.  Since BH is sometimes used deep down it would be very
tedious to start passing around AioContext.  qemu_bh_*() should just do
the right thing.

BTW, besides stopping ioeventfd it is also necessary to either complete
pending I/O requests (drain) or to at least postpone callbacks for
pending I/O requests while another thread is accessing the BDS.

(The problem with postponing callbacks is that block drivers sometimes
have dependencies between requests, for example allocating write
requests that touch the some metadata.  Simply postponing callbacks
leads to deadlock when waiting for dependencies.  This is where locks
save the day at the cost of complexity because they allow multiple
threads to make progress safely.)

 However, I'm not sure it is feasible to have a single AioContext per
 thread.  Consider a backup job whose target is not running in the same
 AioContext as the source; for optimization it might use bdrv_aio_*
 instead of coroutine-based I/O.
 
 So, once you have stopped the ioeventfd to correctly support
 bdrv_drain_all(), I would like to see code that makes other threads use
 bottom halves to do a context switch.  Ping Fan's patches for a
 thread-safe BH list are useful for this.

If I understand correctly this is means a block job wraps I/O calls so
that they are scheduled in a remote AioContext/thread when necessary.
This is necessary when a block job touches multiple BDS which belong to
different threads.

 I think there are two ways to implement synchronous operations, there
 are two approaches:
 
 * A lock, just like Kevin mentioned (though I would place it on the
 AioContext, not the BlockDriverState).  Then you can call aio_wait under
 the lock in bdrv_read and friends.

This sounds clever.  So block jobs don't need to explicitly wrap I/O

Re: [Qemu-devel] [PATCH 0/8] Add virtio-mmio and use it in vexpress

2013-07-17 Thread Christoffer Dall

On Wed, Jul 10, 2013 at 12:56:31PM +0200, Alexander Graf wrote:
 
 On 08.07.2013, at 23:06, Anthony Liguori wrote:
 
  Alexander Graf ag...@suse.de writes:
  
  On 08.07.2013, at 22:08, Anthony Liguori wrote:
  
  I think we're trying to fit a square peg into a round hole.
  
  virtio-mmio is a virtio transport where each device has a dedicated set
  of system resources.
  
  Alex, it sounds like you want virtio-mmio-bus which would be a single
  set of system resources that implemented a virtio bus on top of it.
  
  Well, what I really want is a sysbus that behaves like PCI from a
  usability point of view ;).
  
  Which means you need to have (1) a discovery mechanism with a stable
  addressing mechanism
 
 That's what dtb usually gives you.
 
  (2) a way to communicate this to the guest from the
  host.
 
 Not if the host dictates everything. PCI is only complicated because it 
 allows the guest control. If we don't we can have a push-only interface.
 
 But I'm not sure we should hold back this patch series based on this. I can 
 try to come up with a bus that can automatically place memory regions and 
 IRQs. Then I can add a virtio-mmio-awesomebus type and show you what I mean ;)
 
 For the time being, we can live with a few statically allocated virtio 
 transports I think. As long as we don't promise the user that they're still 
 going to be there in the next version of QEMU.
 
 
I'm not familiar enough with QEMU internals to intelligently comment on
this discussion, but I do have two observations:

(1) It would be tremendously beneficial to have this patch series
merged, so people can actually start to have upstream code that supports
usable VMs on ARM, and it doesn't seem too invasive to me.

(2) About device assignment, what is the QEMU vision in terms of how it
plugs into a board?  Do we expect a mach-virt scenario where we
dynamically create assigned devices, or would we emulate some board that
actually has the peripherals that we are going to assign and emulate the
rest of the devices?

-Christoffer

[Qemu-devel] [PATCH V17 0/9] replace QEMUOptionParameter with QemuOpts parser

2013-07-17 Thread Dong Xu Wang

These patches will replace QEMUOptionParameter with QemuOpts. Change logs
please go to each patch's commit message.

Dong Xu Wang (9):
  qemu-option: add def_value_str in QemuOptDesc struct and rewrite
qemu_opts_print
  qemu-option: avoid duplication of default value in QemuOpts
  qemu-option: create four QemuOptsList related functions
  qemu-option: create some QemuOpts functons
  block: use QemuOpts support in block layer
  qapi: query-command-line-options outputs def_value_str
  qemu-option: remove QEMUOptionParameter related functions and struct
  qemu-option: make qemu_opts_del accept opts being NULL
  qemu-option: use qemu_opts_del without judging NULL

 block.c   |  96 
 block/cow.c   |  54 ++---
 block/gluster.c   |  37 ++-
 block/iscsi.c |  31 ++-
 block/qcow.c  |  69 +++---
 block/qcow2.c | 205 +
 block/qed.c   | 114 -
 block/qed.h   |   2 +-
 block/raw-posix.c |  49 ++--
 block/raw-win32.c |  31 +--
 block/raw.c   |  30 +--
 block/rbd.c   |  62 +++--
 block/sheepdog.c  |  81 ---
 block/ssh.c   |  29 ++-
 block/vdi.c   |  70 +++---
 block/vmdk.c  | 179 +++---
 block/vpc.c   |  65 +++---
 block/vvfat.c |  11 +-
 hw/core/qdev.c|   4 +-
 include/block/block.h |   5 +-
 include/block/block_int.h |   6 +-
 include/qemu/option.h |  56 ++---
 qapi-schema.json  |   8 +-
 qemu-char.c   |   4 +-
 qemu-img.c|  63 +++--
 qmp-commands.hx   |   2 +
 util/qemu-config.c|   4 +
 util/qemu-option.c| 576 +-
 28 files changed, 930 insertions(+), 1013 deletions(-)

-- 
V17 add qemu_opts_del's support when opts is NULL, and made changes based on
Eric's comments.

I tried to split PATCH 5 into small pieces, but I found it is very hard, bdrv_*
functions are all using QEMUOptionParameter or QemuOpts, if I split them, I have
to support two parsers in block.c and qemu-img.c, it will make code very hard to
read, so I leave them in one big patch, I am sorry it is realy hard to review.

Changelogs please goto each patch.

1.7.11.7

[Qemu-devel] [PATCH V17 3/9] qemu-option: create four QemuOptsList related functions

2013-07-17 Thread Dong Xu Wang

This patch creates 4 functions, count_opts_list, qemu_opts_append,
qemu_opts_free and qemu_opts_print_help, they are used in following
commits.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
v16-v17:
1) fix indentation.
2) fix typo.

v15-v16:
1) discard double-initialization.
2) use pointer directly, not g_strdup.

v12-v13:
1) simply assert that neither argument has merge_lists set.
2) drop superfluous paranthesesis around p == first.

v11-v12:
1) renmae functions.
2) fix loop styles and code styles.
3) qemu_opts_apend will not return NULL now.
4) merge_lists value is from arguments in qemu_opts_append.

v6-v7:
1) Fix typo.

v5-v6:
1) allocate enough space in append_opts_list function.

 include/qemu/option.h |  3 +++
 util/qemu-option.c| 73 +++
 2 files changed, 76 insertions(+)

diff --git a/include/qemu/option.h b/include/qemu/option.h
index 0207209..7473e2b 100644
--- a/include/qemu/option.h
+++ b/include/qemu/option.h
@@ -155,4 +155,7 @@ int qemu_opts_print(QemuOpts *opts);
 int qemu_opts_foreach(QemuOptsList *list, qemu_opts_loopfunc func, void 
*opaque,
   int abort_on_failure);
 
+QemuOptsList *qemu_opts_append(QemuOptsList *first, QemuOptsList *second);
+void qemu_opts_free(QemuOptsList *list);
+void qemu_opts_print_help(QemuOptsList *list);
 #endif
diff --git a/util/qemu-option.c b/util/qemu-option.c
index 86cc64e..3a055a7 100644
--- a/util/qemu-option.c
+++ b/util/qemu-option.c
@@ -1176,3 +1176,76 @@ int qemu_opts_foreach(QemuOptsList *list, 
qemu_opts_loopfunc func, void *opaque,
 loc_pop(loc);
 return rc;
 }
+
+static size_t count_opts_list(QemuOptsList *list)
+{
+size_t i;
+
+for (i = 0; list  list-desc[i].name; i++) {
+;
+}
+
+return i;
+}
+
+/* Create a new QemuOptsList with a desc of the merge of the first
+ * and second. It will allocate space for one new QemuOptsList plus
+ * enough space for QemuOptDesc in first and second QemuOptsList.
+ * First argument's QemuOptDesc members take precedence over second's.
+ * The result's name and implied_opt_name are not copied from them.
+ * Both merge_lists should not be set. Both lists can be NULL.
+ */
+QemuOptsList *qemu_opts_append(QemuOptsList *first,
+   QemuOptsList *second)
+{
+size_t num_first_opts, num_second_opts;
+QemuOptsList *dest = NULL;
+int i;
+int index = 0;
+QemuOptsList *p = first;
+
+num_first_opts = count_opts_list(first);
+num_second_opts = count_opts_list(second);
+
+dest = g_malloc0(sizeof(QemuOptsList)
++ (num_first_opts + num_second_opts + 1) * sizeof(QemuOptDesc));
+
+dest-name = append_opts_list;
+dest-implied_opt_name = NULL;
+assert((!first || !first-merge_lists)
+(!second || !second-merge_lists));
+QTAILQ_INIT(dest-head);
+
+for (i = 0; p  p-desc[i].name; i++) {
+if (!find_desc_by_name(dest-desc, p-desc[i].name)) {
+dest-desc[index].name = p-desc[i].name;
+dest-desc[index].help = p-desc[i].help;
+dest-desc[index].type = p-desc[i].type;
+dest-desc[index].def_value_str =  p-desc[i].def_value_str;
+index++;
+}
+if (p == first  p  !p-desc[i].name) {
+p = second;
+i = 0;
+}
+}
+dest-desc[index].name = NULL;
+return dest;
+}
+
+/* free a QemuOptsList, can accept NULL as arguments */
+void qemu_opts_free(QemuOptsList *list)
+{
+g_free(list);
+}
+
+void qemu_opts_print_help(QemuOptsList *list)
+{
+int i;
+printf(Supported options:\n);
+for (i = 0; list  list-desc[i].name; i++) {
+printf(%-16s %s\n, list-desc[i].name,
+   list-desc[i].help ?
+   list-desc[i].help : No description available);
+}
+}
-- 
1.7.11.7

[Qemu-devel] [PATCH V17 1/9] qemu-option: add def_value_str in QemuOptDesc struct and rewrite qemu_opts_print

2013-07-17 Thread Dong Xu Wang

qemu_opts_print has no user now, so can re-write the function safely.

qemu_opts_print is used while using qemu-img create, it
produces the same output as previous code.

The behavior of this function has changed:

1. Print every possible option, whether a value has been set or not.
2. Option descriptors may provide a default value.
3. Print to stdout instead of stderr.

Previously the behavior was to print every option that has been set.
Options that have not been set would be skipped.

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---

v13-v14:
1) fix memory leak.
2) make opt_set do not accpet null value argument.

v12-v13
1) re-write commit message.

v11-v12
1) make def_value_str become the real default value string in opt_set
function.

v10-v11:
1) print all values that have actually been assigned while accept-any
cases.

v7-v8:
1) print elements = accept any params while opts_accepts_any() ==
true.
2) since def_print_str is the default value if an option isn't set,
so rename it to def_value_str.

 include/qemu/option.h |  3 ++-
 util/qemu-option.c| 32 ++--
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/include/qemu/option.h b/include/qemu/option.h
index a83c700..0207209 100644
--- a/include/qemu/option.h
+++ b/include/qemu/option.h
@@ -94,6 +94,7 @@ typedef struct QemuOptDesc {
 const char *name;
 enum QemuOptType type;
 const char *help;
+const char *def_value_str;
 } QemuOptDesc;
 
 struct QemuOptsList {
@@ -150,7 +151,7 @@ QDict *qemu_opts_to_qdict(QemuOpts *opts, QDict *qdict);
 void qemu_opts_absorb_qdict(QemuOpts *opts, QDict *qdict, Error **errp);
 
 typedef int (*qemu_opts_loopfunc)(QemuOpts *opts, void *opaque);
-int qemu_opts_print(QemuOpts *opts, void *dummy);
+int qemu_opts_print(QemuOpts *opts);
 int qemu_opts_foreach(QemuOptsList *list, qemu_opts_loopfunc func, void 
*opaque,
   int abort_on_failure);
 
diff --git a/util/qemu-option.c b/util/qemu-option.c
index e0ef426..be5dfaa 100644
--- a/util/qemu-option.c
+++ b/util/qemu-option.c
@@ -826,16 +826,36 @@ void qemu_opts_del(QemuOpts *opts)
 g_free(opts);
 }
 
-int qemu_opts_print(QemuOpts *opts, void *dummy)
+int qemu_opts_print(QemuOpts *opts)
 {
 QemuOpt *opt;
+QemuOptDesc *desc = opts-list-desc;
 
-fprintf(stderr, %s: %s:, opts-list-name,
-opts-id ? opts-id : noid);
-QTAILQ_FOREACH(opt, opts-head, next) {
-fprintf(stderr,  %s=\%s\, opt-name, opt-str);
+if (desc[0].name == NULL) {
+QTAILQ_FOREACH(opt, opts-head, next) {
+printf(%s=\%s\ , opt-name, opt-str);
+}
+return 0;
+}
+for (; desc  desc-name; desc++) {
+const char *value = desc-def_value_str;
+QemuOpt *opt;
+
+opt = qemu_opt_find(opts, desc-name);
+if (opt) {
+value = opt-str;
+}
+
+if (!value) {
+continue;
+}
+
+if (desc-type == QEMU_OPT_STRING) {
+printf(%s='%s' , desc-name, value);
+} else {
+printf(%s=%s , desc-name, value);
+}
 }
-fprintf(stderr, \n);
 return 0;
 }
 
-- 
1.7.11.7

[Qemu-devel] [PATCH V5 11/12] NUMA: add qmp command query-numa

2013-07-17 Thread Wanlong Gao

Add qmp command query-numa to show guest NUMA information.

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 numa.c   | 78 
 qapi-schema.json | 34 
 qmp-commands.hx  | 49 +++
 3 files changed, 161 insertions(+)

diff --git a/numa.c b/numa.c
index 3befc56..21fc7b9 100644
--- a/numa.c
+++ b/numa.c
@@ -433,3 +433,81 @@ error:
 numa_info[nodeid].flags = flags;
 return;
 }
+
+NUMAInfoList *qmp_query_numa(Error **errp)
+{
+NUMAInfoList *head = NULL, *cur_item = NULL;
+CPUState *cpu;
+int i;
+
+for (i = 0; i  nb_numa_nodes; i++) {
+NUMAInfoList *info;
+intList *cur_cpu_item = NULL;
+info = g_malloc0(sizeof(*info));
+info-value = g_malloc0(sizeof(*info-value));
+info-value-nodeid = i;
+for (cpu = first_cpu; cpu != NULL; cpu = cpu-next_cpu) {
+if (cpu-numa_node == i) {
+intList *node_cpu = g_malloc0(sizeof(*node_cpu));
+node_cpu-value = cpu-cpu_index;
+
+if (!cur_cpu_item) {
+info-value-cpus = cur_cpu_item = node_cpu;
+} else {
+cur_cpu_item-next = node_cpu;
+cur_cpu_item = node_cpu;
+}
+}
+}
+info-value-memory = numa_info[i].node_mem  20;
+
+#ifdef CONFIG_NUMA
+switch (numa_info[i].flags  NODE_HOST_POLICY_MASK) {
+case NODE_HOST_BIND:
+info-value-policy = g_strdup(membind);
+break;
+case NODE_HOST_INTERLEAVE:
+info-value-policy = g_strdup(interleave);
+break;
+case NODE_HOST_PREFERRED:
+info-value-policy = g_strdup(preferred);
+break;
+default:
+info-value-policy = g_strdup(defalut);
+break;
+}
+
+if (numa_info[i].flags  NODE_HOST_RELATIVE) {
+info-value-relative = true;
+}
+
+unsigned long first, next;
+next = first = find_first_bit(numa_info[i].host_mem, MAX_CPUMASK_BITS);
+intList *cur_node_item = g_malloc0(sizeof(*cur_node_item));
+cur_node_item-value = first;
+info-value-host_nodes = cur_node_item;
+do {
+if (next == numa_max_node())
+break;
+next = find_next_bit(numa_info[i].host_mem, MAX_CPUMASK_BITS,
+ next + 1);
+if (next  numa_max_node() || next == MAX_CPUMASK_BITS)
+break;
+
+intList *host_node = g_malloc0(sizeof(*host_node));
+host_node-value = next;
+cur_node_item-next = host_node;
+cur_node_item = host_node;
+} while (true);
+#endif
+
+if (!cur_item) {
+head = cur_item = info;
+} else {
+cur_item-next = info;
+cur_item = info;
+}
+}
+
+return head;
+}
diff --git a/qapi-schema.json b/qapi-schema.json
index fd4c615..085ba75 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3748,3 +3748,37 @@
 { 'command': 'set-mem-policy',
   'data': {'nodeid': 'int', '*policy': 'str',
'*host-nodes': 'str'} }
+##
+# @NUMAInfo:
+#
+# Information about guest NUMA nodes
+#
+# @nodeid: NUMA node ID
+#
+# @cpus: VCPUs contained to this node
+#
+# @memory: memory size of this node
+#
+# @policy: memory policy of this node
+#
+# @relative: if host nodes is relative for memory policy
+#
+# @host-nodes: host nodes for its memory policy
+#
+# Since: 1.6
+#
+##
+{ 'type': 'NUMAInfo',
+  'data': {'nodeid': 'int', 'cpus': ['int'], 'memory': 'int',
+   'policy': 'str', 'relative': 'bool', 'host-nodes': ['int'] }}
+
+##
+# @query-numa:
+#
+# Returns a list of information about each guest node.
+#
+# Returns: a list of @NUMAInfo for each guest node
+#
+# Since: 1.6
+##
+{ 'command': 'query-numa', 'returns': ['NUMAInfo'] }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index aac88e0..998fb00 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -3082,3 +3082,52 @@ Notes:
 2. If host-nodes is not set, the node mask of this policy will be set
to all.
 EQMP
+
+{
+.name = query-numa,
+.args_type = ,
+.mhandler.cmd_new = qmp_marshal_input_query_numa,
+},
+
+SQMP
+query-numa
+-
+
+Show NUMA information.
+
+Return a json-array. Each NUMA node is represented by a json-object,
+which contains:
+
+- nodeid: NUMA node ID (json-int)
+- cpus: a json-arry of contained VCPUs
+- memory: amount of memory in each node in MB (json-int)
+- policy: memory policy of this node (json-string)
+- relative: if host nodes is relative for its memory policy (json-bool)
+- host-nodes: a json-array of host nodes for its memory policy
+
+Arguments:
+
+Example:
+
+- { excute: query-numa }
+- { return:[
+{
+nodeid: 0,
+cpus: [0, 1],
+

[Qemu-devel] [PATCH V5 00/12] Add support for binding guest numa nodes to host numa nodes

2013-07-17 Thread Wanlong Gao

As you know, QEMU can't direct it's memory allocation now, this may cause
guest cross node access performance regression.
And, the worse thing is that if PCI-passthrough is used,
direct-attached-device uses DMA transfer between device and qemu process.
All pages of the guest will be pinned by get_user_pages().

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
=kvm_assign_device()
  = kvm_iommu_map_memslots()
= kvm_iommu_map_pages()
   = kvm_pin_pages()

So, with direct-attached-device, all guest page's page count will be +1 and
any page migration will not work. AutoNUMA won't too.

So, we should set the guest nodes memory allocation policy before
the pages are really mapped.

According to this patch set, we are able to set guest nodes memory policy
like following:

 -numa node,nodeid=0,cpus=0, \
 -numa mem,size=1024M,policy=membind,host-nodes=0-1 \
 -numa node,nodeid=1,cpus=1 \
 -numa mem,size=1024M,policy=interleave,host-nodes=1

This supports policy={membind|interleave|preferred},host-nodes=[+|!]{all|N-N} 
like format.

And patch 9/12 adds a QMP command set-mem-policy to set the memory policy
for every guest nodes:
set-mem-policy nodeid=0 policy=membind host-nodes=0-1

And patch 10/12 adds a monitor command set-mem-policy whose format like:
set-mem-policy 0 policy=membind,host-nodes=0-1

And patch 11/12 adds a QMP command query-numa to show numa info through
this API.

And patch 12/12 converts the info numa monitor command to use this
QMP command query-numa.


V1-V2:
change to use QemuOpts in numa options (Paolo)
handle Error in mpol parser (Paolo)
change qmp command format to mem-policy=membind,mem-hostnode=0-1 like 
(Paolo)
V2-V3:
also handle Error in cpus parser (5/10)
split out common parser from cpus and hostnode parser (Bandan 6/10)
V3-V4:
rebase to request for comments
V4-V5:
use OptVisitor and split -numa option (Paolo)
 - s/set-mpol/set-mem-policy (Andreas)
 - s/mem-policy/policy
 - s/mem-hostnode/host-nodes
fix hmp command process after error (Luiz)
add qmp command query-numa and convert info numa to it (Luiz)


Wanlong Gao (12):
  NUMA: add NumaOptions, NumaNodeOptions and NumaMemOptions
  NUMA: split -numa option
  NUMA: move numa related code to numa.c
  NUMA: Add numa_info structure to contain numa nodes info
  NUMA: Add Linux libnuma detection
  NUMA: parse guest numa nodes memory policy
  NUMA: split out the common range parser
  NUMA: set guest numa nodes memory policy
  NUMA: add qmp command set-mem-policy to set memory policy for NUMA
node
  NUMA: add hmp command set-mem-policy
  NUMA: add qmp command query-numa
  NUMA: convert hmp command info_numa to use qmp command query_numa

 Makefile.target |   2 +-
 configure   |  32 +++
 cpus.c  |  14 --
 hmp-commands.hx |  16 ++
 hmp.c   |  70 +++
 hmp.h   |   2 +
 hw/i386/pc.c|   4 +-
 hw/net/eepro100.c   |   1 -
 include/sysemu/cpus.h   |   1 -
 include/sysemu/sysemu.h |  20 +-
 monitor.c   |  21 +-
 numa.c  | 513 
 qapi-schema.json| 103 ++
 qemu-options.hx |   6 +-
 qmp-commands.hx |  84 
 vl.c| 157 ++-
 16 files changed, 861 insertions(+), 185 deletions(-)
 create mode 100644 numa.c

-- 
1.8.3.2.634.g7a3187e

[Qemu-devel] [PATCH V5 07/12] NUMA: split out the common range parser

2013-07-17 Thread Wanlong Gao

Since cpus parser and hostnode parser have the common range parser
part, split it out to the common range parser to avoid the duplicate
code.

Reviewed-by: Bandan Das b...@redhat.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 numa.c | 86 --
 1 file changed, 37 insertions(+), 49 deletions(-)

diff --git a/numa.c b/numa.c
index a2eceb1..736acbb 100644
--- a/numa.c
+++ b/numa.c
@@ -36,48 +36,59 @@ QemuOptsList qemu_numa_opts = {
 .desc = { { 0 } } /* validated with OptsVisitor */
 };
 
-static int numa_node_parse_cpus(int nodenr, const char *cpus)
+static int numa_range_parse(const char *str,
+unsigned long long *value,
+unsigned long long *endvalue)
 {
 char *endptr;
-unsigned long long value, endvalue;
 
-/* Empty CPU range strings will be considered valid, they will simply
- * not set any bit in the CPU bitmap.
- */
-if (!*cpus) {
-return 0;
+if (parse_uint(str, value, endptr, 10)  0) {
+return -1;
 }
 
-if (parse_uint(cpus, value, endptr, 10)  0) {
-goto error;
-}
 if (*endptr == '-') {
-if (parse_uint_full(endptr + 1, endvalue, 10)  0) {
-goto error;
+if (parse_uint_full(endptr + 1, endvalue, 10)  0) {
+return -1;
 }
 } else if (*endptr == '\0') {
-endvalue = value;
+*endvalue = *value;
 } else {
-goto error;
+return -1;
 }
 
-if (endvalue = MAX_CPUMASK_BITS) {
-endvalue = MAX_CPUMASK_BITS - 1;
+if (*endvalue = MAX_CPUMASK_BITS) {
+*endvalue = MAX_CPUMASK_BITS - 1;
 fprintf(stderr,
-qemu: NUMA: A max of %d VCPUs are supported\n,
+qemu: NUMA: A max of %d VCPUs/Nodes are supported\n,
  MAX_CPUMASK_BITS);
 }
 
-if (endvalue  value) {
-goto error;
+if (*endvalue  *value) {
+return -1;
 }
 
-bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1);
 return 0;
+}
 
-error:
-fprintf(stderr, qemu: Invalid NUMA CPU range: %s\n, cpus);
-return -1;
+
+static int numa_node_parse_cpus(int nodenr, const char *cpus)
+{
+unsigned long long value, endvalue;
+
+/* Empty CPU range strings will be considered valid, they will simply
+ * not set any bit in the CPU bitmap.
+ */
+if (!*cpus) {
+return 0;
+}
+
+if (numa_range_parse(cpus, value, endvalue)  0) {
+fprintf(stderr, Invalid NUMA CPU range: %s, cpus);
+return -1;
+}
+
+bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1);
+return 0;
 }
 
 static int numa_node_parse(NumaNodeOptions *opts)
@@ -115,7 +126,6 @@ static int numa_mem_parse_policy(int nodenr, const char 
*policy)
 static int numa_mem_parse_hostnodes(int nodenr, const char *hostnodes)
 {
 unsigned long long value, endvalue;
-char *endptr;
 bool clear = false;
 unsigned long *bm = numa_info[nodenr].host_mem;
 
@@ -134,27 +144,9 @@ static int numa_mem_parse_hostnodes(int nodenr, const char 
*hostnodes)
 return 0;
 }
 
-if (parse_uint(hostnodes, value, endptr, 10)  0)
-goto error;
-if (*endptr == '-') {
-if (parse_uint_full(endptr + 1, endvalue, 10)  0) {
-goto error;
-}
-} else if (*endptr == '\0') {
-endvalue = value;
-} else {
-goto error;
-}
-
-if (endvalue = MAX_CPUMASK_BITS) {
-endvalue = MAX_CPUMASK_BITS - 1;
-fprintf(stderr,
-qemu: NUMA: A max of %d host nodes are supported\n,
- MAX_CPUMASK_BITS);
-}
-
-if (endvalue  value) {
-goto error;
+if (numa_range_parse(hostnodes, value, endvalue)  0) {
+fprintf(stderr, Invalid host NUMA nodes range: %s, hostnodes);
+return -1;
 }
 
 if (clear)
@@ -163,10 +155,6 @@ static int numa_mem_parse_hostnodes(int nodenr, const char 
*hostnodes)
 bitmap_set(bm, value, endvalue - value + 1);
 
 return 0;
-
-error:
-fprintf(stderr, qemu: Invalid host NUMA nodes range: %s\n, hostnodes);
-return -1;
 }
 
 static int numa_mem_parse(NumaMemOptions *opts)
-- 
1.8.3.2.634.g7a3187e

[Qemu-devel] [PATCH V5 06/12] NUMA: parse guest numa nodes memory policy

2013-07-17 Thread Wanlong Gao

The memory policy setting format is like:
policy={membind|interleave|preferred},host-node=[+|!]{all|N-N}
And we are adding this setting as a suboption of -numa mem,,
the memory policy then can be set like following:
-numa node,nodeid=0,cpus=0 \
-numa node,nodeid=1,cpus=1 \
-numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \
-numa mem,nodeid=1,size=1G,policy=interleave,host-nodes=!1

Reviewed-by: Bandan Das b...@redhat.com
Signed-off-by: Andre Przywara andre.przyw...@amd.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 include/sysemu/sysemu.h |  8 +
 numa.c  | 83 +
 qapi-schema.json|  8 -
 vl.c|  2 ++
 4 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 28fe305..af17c02 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -130,10 +130,18 @@ extern QEMUClock *rtc_clock;
 
 #define MAX_NODES 64
 #define MAX_CPUMASK_BITS 255
+#define NODE_HOST_NONE0x00
+#define NODE_HOST_BIND0x01
+#define NODE_HOST_INTERLEAVE  0x02
+#define NODE_HOST_PREFERRED   0x03
+#define NODE_HOST_POLICY_MASK 0x03
+#define NODE_HOST_RELATIVE0x04
 extern int nb_numa_nodes;
 typedef struct node_info {
 uint64_t node_mem;
 DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
+DECLARE_BITMAP(host_mem, MAX_CPUMASK_BITS);
+unsigned int flags;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 extern QemuOptsList qemu_numa_opts;
diff --git a/numa.c b/numa.c
index 766e111..a2eceb1 100644
--- a/numa.c
+++ b/numa.c
@@ -96,6 +96,79 @@ static int numa_node_parse(NumaNodeOptions *opts)
 return numa_node_parse_cpus(nodenr, cpus);
 }
 
+static int numa_mem_parse_policy(int nodenr, const char *policy)
+{
+if (!strcmp(policy, interleave)) {
+numa_info[nodenr].flags |= NODE_HOST_INTERLEAVE;
+} else if (!strcmp(policy, preferred)) {
+numa_info[nodenr].flags |= NODE_HOST_PREFERRED;
+} else if (!strcmp(policy, membind)) {
+numa_info[nodenr].flags |= NODE_HOST_BIND;
+} else {
+fprintf(stderr, qemu: Invalid memory policy: %s\n, policy);
+return -1;
+}
+
+return 0;
+}
+
+static int numa_mem_parse_hostnodes(int nodenr, const char *hostnodes)
+{
+unsigned long long value, endvalue;
+char *endptr;
+bool clear = false;
+unsigned long *bm = numa_info[nodenr].host_mem;
+
+if (hostnodes[0] == '!') {
+clear = true;
+bitmap_fill(bm, MAX_CPUMASK_BITS);
+hostnodes++;
+}
+if (hostnodes[0] == '+') {
+numa_info[nodenr].flags |= NODE_HOST_RELATIVE;
+hostnodes++;
+}
+
+if (!strcmp(hostnodes, all)) {
+bitmap_fill(bm, MAX_CPUMASK_BITS);
+return 0;
+}
+
+if (parse_uint(hostnodes, value, endptr, 10)  0)
+goto error;
+if (*endptr == '-') {
+if (parse_uint_full(endptr + 1, endvalue, 10)  0) {
+goto error;
+}
+} else if (*endptr == '\0') {
+endvalue = value;
+} else {
+goto error;
+}
+
+if (endvalue = MAX_CPUMASK_BITS) {
+endvalue = MAX_CPUMASK_BITS - 1;
+fprintf(stderr,
+qemu: NUMA: A max of %d host nodes are supported\n,
+ MAX_CPUMASK_BITS);
+}
+
+if (endvalue  value) {
+goto error;
+}
+
+if (clear)
+bitmap_clear(bm, value, endvalue - value + 1);
+else
+bitmap_set(bm, value, endvalue - value + 1);
+
+return 0;
+
+error:
+fprintf(stderr, qemu: Invalid host NUMA nodes range: %s\n, hostnodes);
+return -1;
+}
+
 static int numa_mem_parse(NumaMemOptions *opts)
 {
 uint64_t nodenr, mem_size;
@@ -110,6 +183,16 @@ static int numa_mem_parse(NumaMemOptions *opts)
 mem_size = opts-size;
 numa_info[nodenr].node_mem = mem_size;
 
+const char *policy = opts-policy;
+if (numa_mem_parse_policy(nodenr, policy) == -1) {
+return -1;
+}
+
+const char *hostnodes = opts-host_nodes;
+if (numa_mem_parse_hostnodes(nodenr, hostnodes) == -1) {
+return -1;
+}
+
 return 0;
 }
 
diff --git a/qapi-schema.json b/qapi-schema.json
index f753a35..49eac70 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3717,9 +3717,15 @@
 #
 # @size: #optional memory size of this node
 #
+# @policy: #optional memory policy of this node
+#
+# @host-nodes: #optional host nodes for its memory policy
+#
 # Since 1.6
 ##
 { 'type': 'NumaMemOptions',
   'data': {
'*nodeid':  'int',
-   '*size':'size' }}
+   '*size':'size',
+   '*policy':  'str',
+   '*host-nodes':  'str' }}
diff --git a/vl.c b/vl.c
index 5fdba97..dc8131c 100644
--- a/vl.c
+++ b/vl.c
@@ -2887,6 +2887,8 @@ int main(int argc, char **argv, char **envp)
 for (i = 0; i  MAX_NODES; i++) {
 numa_info[i].node_mem = 0;
 bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS);

[Qemu-devel] [PATCH V5 03/12] NUMA: move numa related code to numa.c

2013-07-17 Thread Wanlong Gao

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 cpus.c  | 14 
 include/sysemu/cpus.h   |  1 -
 include/sysemu/sysemu.h |  1 +
 numa.c  | 59 +
 vl.c| 43 +--
 5 files changed, 61 insertions(+), 57 deletions(-)

diff --git a/cpus.c b/cpus.c
index 8062cdd..f45fb5d 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1198,20 +1198,6 @@ static void tcg_exec_all(void)
 exit_request = 0;
 }
 
-void set_numa_modes(void)
-{
-CPUState *cpu;
-int i;
-
-for (cpu = first_cpu; cpu != NULL; cpu = cpu-next_cpu) {
-for (i = 0; i  nb_numa_nodes; i++) {
-if (test_bit(cpu-cpu_index, node_cpumask[i])) {
-cpu-numa_node = i;
-}
-}
-}
-}
-
 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
 {
 /* XXX: implement xxx_cpu_list for targets that still miss it */
diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
index 6502488..4f79081 100644
--- a/include/sysemu/cpus.h
+++ b/include/sysemu/cpus.h
@@ -23,7 +23,6 @@ extern int smp_threads;
 #define smp_threads 1
 #endif
 
-void set_numa_modes(void);
 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg);
 
 #endif
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index cf8e6e5..7acfad0 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -135,6 +135,7 @@ extern unsigned long *node_cpumask[MAX_NODES];
 extern QemuOptsList qemu_numa_opts;
 int numa_init_func(QemuOpts *opts, void *opaque);
 void set_numa_nodes(void);
+void set_numa_modes(void);
 
 #define MAX_OPTION_ROMS 16
 typedef struct QEMUOptionRom {
diff --git a/numa.c b/numa.c
index da68c4b..fca5e6f 100644
--- a/numa.c
+++ b/numa.c
@@ -162,3 +162,62 @@ error:
 return ret;
 }
 
+void set_numa_nodes(void)
+{
+if (nb_numa_nodes  0) {
+int i;
+
+if (nb_numa_nodes  MAX_NODES) {
+nb_numa_nodes = MAX_NODES;
+}
+
+/* If no memory size if given for any node, assume the default case
+ * and distribute the available memory equally across all nodes
+ */
+for (i = 0; i  nb_numa_nodes; i++) {
+if (node_mem[i] != 0)
+break;
+}
+if (i == nb_numa_nodes) {
+uint64_t usedmem = 0;
+
+/* On Linux, the each node's border has to be 8MB aligned,
+ * the final node gets the rest.
+ */
+for (i = 0; i  nb_numa_nodes - 1; i++) {
+node_mem[i] = (ram_size / nb_numa_nodes)  ~((1  23UL) - 1);
+usedmem += node_mem[i];
+}
+node_mem[i] = ram_size - usedmem;
+}
+
+for (i = 0; i  nb_numa_nodes; i++) {
+if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) {
+break;
+}
+}
+/* assigning the VCPUs round-robin is easier to implement, guest OSes
+ * must cope with this anyway, because there are BIOSes out there in
+ * real machines which also use this scheme.
+ */
+if (i == nb_numa_nodes) {
+for (i = 0; i  max_cpus; i++) {
+set_bit(i, node_cpumask[i % nb_numa_nodes]);
+}
+}
+}
+}
+
+void set_numa_modes(void)
+{
+CPUState *cpu;
+int i;
+
+for (cpu = first_cpu; cpu != NULL; cpu = cpu-next_cpu) {
+for (i = 0; i  nb_numa_nodes; i++) {
+if (test_bit(cpu-cpu_index, node_cpumask[i])) {
+cpu-numa_node = i;
+}
+}
+}
+}
diff --git a/vl.c b/vl.c
index d3e6d8c..7cea925 100644
--- a/vl.c
+++ b/vl.c
@@ -4139,48 +4139,7 @@ int main(int argc, char **argv, char **envp)
 exit(1);
 }
 
-if (nb_numa_nodes  0) {
-int i;
-
-if (nb_numa_nodes  MAX_NODES) {
-nb_numa_nodes = MAX_NODES;
-}
-
-/* If no memory size if given for any node, assume the default case
- * and distribute the available memory equally across all nodes
- */
-for (i = 0; i  nb_numa_nodes; i++) {
-if (node_mem[i] != 0)
-break;
-}
-if (i == nb_numa_nodes) {
-uint64_t usedmem = 0;
-
-/* On Linux, the each node's border has to be 8MB aligned,
- * the final node gets the rest.
- */
-for (i = 0; i  nb_numa_nodes - 1; i++) {
-node_mem[i] = (ram_size / nb_numa_nodes)  ~((1  23UL) - 1);
-usedmem += node_mem[i];
-}
-node_mem[i] = ram_size - usedmem;
-}
-
-for (i = 0; i  nb_numa_nodes; i++) {
-if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) {
-break;
-}
-}
-/* assigning the VCPUs round-robin is easier to implement, guest OSes
- * must cope with this anyway, because

[Qemu-devel] [PATCH V5 04/12] NUMA: Add numa_info structure to contain numa nodes info

2013-07-17 Thread Wanlong Gao

Add the numa_info structure to contain the numa nodes memory,
VCPUs information and the future added numa nodes host memory
policies.

Reviewed-by: Eduardo Habkost ehabk...@redhat.com
Signed-off-by: Andre Przywara andre.przyw...@amd.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 hw/i386/pc.c|  4 ++--
 hw/net/eepro100.c   |  1 -
 include/sysemu/sysemu.h |  8 ++--
 monitor.c   |  2 +-
 numa.c  | 18 +-
 vl.c|  7 +++
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index c5d8570..d518876 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -654,14 +654,14 @@ static FWCfgState *bochs_bios_init(void)
 unsigned int apic_id = x86_cpu_apic_id_from_index(i);
 assert(apic_id  apic_id_limit);
 for (j = 0; j  nb_numa_nodes; j++) {
-if (test_bit(i, node_cpumask[j])) {
+if (test_bit(i, numa_info[j].node_cpu)) {
 numa_fw_cfg[apic_id + 1] = cpu_to_le64(j);
 break;
 }
 }
 }
 for (i = 0; i  nb_numa_nodes; i++) {
-numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]);
+numa_fw_cfg[apic_id_limit + 1 + i] = 
cpu_to_le64(numa_info[i].node_mem);
 }
 fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg,
  (1 + apic_id_limit + nb_numa_nodes) *
diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c
index e0befb2..3dc4937 100644
--- a/hw/net/eepro100.c
+++ b/hw/net/eepro100.c
@@ -105,7 +105,6 @@
 #define PCI_IO_SIZE 64
 #define PCI_FLASH_SIZE  (128 * KiB)
 
-#define BIT(n) (1  (n))
 #define BITS(n, m) (((0xU  (31 - n))  (31 - n + m))  m)
 
 /* The SCB accepts the following controls for the Tx and Rx units: */
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 7acfad0..28fe305 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -9,6 +9,7 @@
 #include qapi-types.h
 #include qemu/notify.h
 #include qemu/main-loop.h
+#include qemu/bitmap.h
 
 /* vl.c */
 
@@ -130,8 +131,11 @@ extern QEMUClock *rtc_clock;
 #define MAX_NODES 64
 #define MAX_CPUMASK_BITS 255
 extern int nb_numa_nodes;
-extern uint64_t node_mem[MAX_NODES];
-extern unsigned long *node_cpumask[MAX_NODES];
+typedef struct node_info {
+uint64_t node_mem;
+DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
+} NodeInfo;
+extern NodeInfo numa_info[MAX_NODES];
 extern QemuOptsList qemu_numa_opts;
 int numa_init_func(QemuOpts *opts, void *opaque);
 void set_numa_nodes(void);
diff --git a/monitor.c b/monitor.c
index 2ba7876..566709a 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1818,7 +1818,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict)
 }
 monitor_printf(mon, \n);
 monitor_printf(mon, node %d size: % PRId64  MB\n, i,
-node_mem[i]  20);
+numa_info[i].node_mem  20);
 }
 }
 
diff --git a/numa.c b/numa.c
index fca5e6f..766e111 100644
--- a/numa.c
+++ b/numa.c
@@ -72,7 +72,7 @@ static int numa_node_parse_cpus(int nodenr, const char *cpus)
 goto error;
 }
 
-bitmap_set(node_cpumask[nodenr], value, endvalue-value+1);
+bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1);
 return 0;
 
 error:
@@ -108,7 +108,7 @@ static int numa_mem_parse(NumaMemOptions *opts)
 }
 
 mem_size = opts-size;
-node_mem[nodenr] = mem_size;
+numa_info[nodenr].node_mem = mem_size;
 
 return 0;
 }
@@ -175,7 +175,7 @@ void set_numa_nodes(void)
  * and distribute the available memory equally across all nodes
  */
 for (i = 0; i  nb_numa_nodes; i++) {
-if (node_mem[i] != 0)
+if (numa_info[i].node_mem != 0)
 break;
 }
 if (i == nb_numa_nodes) {
@@ -185,14 +185,14 @@ void set_numa_nodes(void)
  * the final node gets the rest.
  */
 for (i = 0; i  nb_numa_nodes - 1; i++) {
-node_mem[i] = (ram_size / nb_numa_nodes)  ~((1  23UL) - 1);
-usedmem += node_mem[i];
+numa_info[i].node_mem = (ram_size / nb_numa_nodes)  ~((1  
23UL) - 1);
+usedmem += numa_info[i].node_mem;
 }
-node_mem[i] = ram_size - usedmem;
+numa_info[i].node_mem = ram_size - usedmem;
 }
 
 for (i = 0; i  nb_numa_nodes; i++) {
-if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) {
+if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
 break;
 }
 }
@@ -202,7 +202,7 @@ void set_numa_nodes(void)
  */
 if (i == nb_numa_nodes) {
 for (i = 0; i  max_cpus; i++) {
-set_bit(i, node_cpumask[i % nb_numa_nodes]);
+set_bit(i, numa_info[i % nb_numa_nodes].node_cpu);
 }
 }
 }
@@ -215,7 +215,7 @@ void set_numa_modes(void)

[Qemu-devel] [PATCH V17 9/9] qemu-option: use qemu_opts_del without judging NULL

2013-07-17 Thread Dong Xu Wang

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 block.c| 8 ++--
 hw/core/qdev.c | 4 +---
 qemu-char.c| 4 +---
 qemu-img.c | 4 +---
 4 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/block.c b/block.c
index e1d1c3d..e700593 100644
--- a/block.c
+++ b/block.c
@@ -408,9 +408,7 @@ int bdrv_create(BlockDriver *drv, const char* filename, 
QemuOpts *opts)
 ret = cco.ret;
 
 out:
-if (!opts) {
-qemu_opts_del(cco.opts);
-}
+qemu_opts_del(cco.opts);
 g_free(cco.filename);
 return ret;
 }
@@ -4575,9 +4573,7 @@ void bdrv_img_create(const char *filename, const char 
*fmt,
 }
 
 out:
-if (opts) {
-qemu_opts_del(opts);
-}
+qemu_opts_del(opts);
 qemu_opts_free(create_opts);
 
 if (bs) {
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index 9190a7e..2a33eca 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -768,9 +768,7 @@ static void device_initfn(Object *obj)
 static void device_finalize(Object *obj)
 {
 DeviceState *dev = DEVICE(obj);
-if (dev-opts) {
-qemu_opts_del(dev-opts);
-}
+qemu_opts_del(dev-opts);
 }
 
 static void device_class_base_init(ObjectClass *class, void *data)
diff --git a/qemu-char.c b/qemu-char.c
index 800d6a6..6cd650d 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -3379,9 +3379,7 @@ void qemu_chr_delete(CharDriverState *chr)
 }
 g_free(chr-filename);
 g_free(chr-label);
-if (chr-opts) {
-qemu_opts_del(chr-opts);
-}
+qemu_opts_del(chr-opts);
 g_free(chr);
 }
 
diff --git a/qemu-img.c b/qemu-img.c
index 3f1b6ee..44dd07d 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1527,9 +1527,7 @@ static int img_convert(int argc, char **argv)
 out:
 qemu_progress_end();
 qemu_opts_free(create_opts);
-if (opts) {
-qemu_opts_del(opts);
-}
+qemu_opts_del(opts);
 qemu_vfree(buf);
 if (out_bs) {
 bdrv_delete(out_bs);
-- 
1.7.11.7

[Qemu-devel] [PATCH V5 05/12] NUMA: Add Linux libnuma detection

2013-07-17 Thread Wanlong Gao

Add detection of libnuma (mostly contained in the numactl package)
to the configure script. Can be enabled or disabled on the command line,
default is use if available.

Signed-off-by: Andre Przywara andre.przyw...@amd.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 configure | 32 
 1 file changed, 32 insertions(+)

diff --git a/configure b/configure
index cb0f870..7fb14aa 100755
--- a/configure
+++ b/configure
@@ -242,6 +242,7 @@ gtk=
 gtkabi=2.0
 tpm=no
 libssh2=
+numa=
 
 # parse CC options first
 for opt do
@@ -944,6 +945,10 @@ for opt do
   ;;
   --enable-libssh2) libssh2=yes
   ;;
+  --disable-numa) numa=no
+  ;;
+  --enable-numa) numa=yes
+  ;;
   *) echo ERROR: unknown option $opt; show_help=yes
   ;;
   esac
@@ -1158,6 +1163,8 @@ echo   --gcov=GCOV  use specified gcov 
[$gcov_tool]
 echo   --enable-tpm enable TPM support
 echo   --disable-libssh2disable ssh block device support
 echo   --enable-libssh2 enable ssh block device support
+echo   --disable-numa   disable libnuma support
+echo   --enable-numaenable libnuma support
 echo 
 echo NOTE: The object files are built at the place where configure is 
launched
 exit 1
@@ -2389,6 +2396,27 @@ EOF
 fi
 
 ##
+# libnuma probe
+
+if test $numa != no ; then
+  numa=no
+  cat  $TMPC  EOF
+#include numa.h
+int main(void) { return numa_available(); }
+EOF
+
+  if compile_prog  -lnuma ; then
+numa=yes
+libs_softmmu=-lnuma $libs_softmmu
+  else
+if test $numa = yes ; then
+  feature_not_found linux NUMA (install numactl?)
+fi
+numa=no
+  fi
+fi
+
+##
 # linux-aio probe
 
 if test $linux_aio != no ; then
@@ -3587,6 +3615,7 @@ echo TPM support   $tpm
 echo libssh2 support   $libssh2
 echo TPM passthrough   $tpm_passthrough
 echo QOM debugging $qom_cast_debug
+echo NUMA host support $numa
 
 if test $sdl_too_old = yes; then
 echo - Your SDL version is too old - please upgrade to have SDL support
@@ -3620,6 +3649,9 @@ echo extra_cflags=$EXTRA_CFLAGS  $config_host_mak
 echo extra_ldflags=$EXTRA_LDFLAGS  $config_host_mak
 echo qemu_localedir=$qemu_localedir  $config_host_mak
 echo libs_softmmu=$libs_softmmu  $config_host_mak
+if test $numa = yes; then
+  echo CONFIG_NUMA=y  $config_host_mak
+fi
 
 echo ARCH=$ARCH  $config_host_mak
 
-- 
1.8.3.2.634.g7a3187e

[Qemu-devel] [PATCH V5 08/12] NUMA: set guest numa nodes memory policy

2013-07-17 Thread Wanlong Gao

Set the guest numa nodes memory policies using the mbind(2)
system call node by node.
After this patch, we are able to set guest nodes memory policies
through the QEMU options, this arms to solve the guest cross
nodes memory access performance issue.
And as you all know, if PCI-passthrough is used,
direct-attached-device uses DMA transfer between device and qemu process.
All pages of the guest will be pinned by get_user_pages().

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
=kvm_assign_device()
  = kvm_iommu_map_memslots()
= kvm_iommu_map_pages()
   = kvm_pin_pages()

So, with direct-attached-device, all guest page's page count will be +1 and
any page migration will not work. AutoNUMA won't too.

So, we should set the guest nodes memory allocation policies before
the pages are really mapped.

Signed-off-by: Andre Przywara andre.przyw...@amd.com
Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 numa.c | 86 ++
 1 file changed, 86 insertions(+)

diff --git a/numa.c b/numa.c
index 736acbb..24c886b 100644
--- a/numa.c
+++ b/numa.c
@@ -28,6 +28,16 @@
 #include qapi-visit.h
 #include qapi/opts-visitor.h
 #include qapi/dealloc-visitor.h
+#include exec/memory.h
+
+#ifdef CONFIG_NUMA
+#include numa.h
+#include numaif.h
+#ifndef MPOL_F_RELATIVE_NODES
+#define MPOL_F_RELATIVE_NODES (1  14)
+#define MPOL_F_STATIC_NODES   (1  15)
+#endif
+#endif
 
 QemuOptsList qemu_numa_opts = {
 .name = numa,
@@ -279,6 +289,75 @@ void set_numa_nodes(void)
 }
 }
 
+#ifdef CONFIG_NUMA
+static int node_parse_bind_mode(unsigned int nodeid)
+{
+int bind_mode;
+
+switch (numa_info[nodeid].flags  NODE_HOST_POLICY_MASK) {
+case NODE_HOST_BIND:
+bind_mode = MPOL_BIND;
+break;
+case NODE_HOST_INTERLEAVE:
+bind_mode = MPOL_INTERLEAVE;
+break;
+case NODE_HOST_PREFERRED:
+bind_mode = MPOL_PREFERRED;
+break;
+default:
+bind_mode = MPOL_DEFAULT;
+return bind_mode;
+}
+
+bind_mode |= (numa_info[nodeid].flags  NODE_HOST_RELATIVE) ?
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES;
+
+return bind_mode;
+}
+#endif
+
+static int set_node_mem_policy(unsigned int nodeid)
+{
+#ifdef CONFIG_NUMA
+void *ram_ptr;
+RAMBlock *block;
+ram_addr_t len, ram_offset = 0;
+int bind_mode;
+int i;
+
+QTAILQ_FOREACH(block, ram_list.blocks, next) {
+if (!strcmp(block-mr-name, pc.ram)) {
+break;
+}
+}
+
+if (block-host == NULL)
+return -1;
+
+ram_ptr = block-host;
+for (i = 0; i  nodeid; i++) {
+len = numa_info[i].node_mem;
+ram_offset += len;
+}
+
+len = numa_info[i].node_mem;
+bind_mode = node_parse_bind_mode(i);
+
+/* This is a workaround for a long standing bug in Linux'
+ * mbind implementation, which cuts off the last specified
+ * node. To stay compatible should this bug be fixed, we
+ * specify one more node and zero this one out.
+ */
+clear_bit(numa_num_configured_nodes() + 1, numa_info[i].host_mem);
+if (mbind(ram_ptr + ram_offset, len, bind_mode,
+numa_info[i].host_mem, numa_num_configured_nodes() + 1, 0)) {
+perror(mbind);
+return -1;
+}
+#endif
+return 0;
+}
+
 void set_numa_modes(void)
 {
 CPUState *cpu;
@@ -291,4 +370,11 @@ void set_numa_modes(void)
 }
 }
 }
+
+for (i = 0; i  nb_numa_nodes; i++) {
+if (set_node_mem_policy(i) == -1) {
+fprintf(stderr,
+qemu: can't set host memory policy for node%d\n, i);
+}
+}
 }
-- 
1.8.3.2.634.g7a3187e

Re: [Qemu-devel] [PATCH 0/8] Add virtio-mmio and use it in vexpress

2013-07-17 Thread Peter Maydell

On 17 July 2013 10:30, Christoffer Dall christoffer.d...@linaro.org wrote:
 I'm not familiar enough with QEMU internals to intelligently comment on
 this discussion, but I do have two observations:

 (1) It would be tremendously beneficial to have this patch series
 merged, so people can actually start to have upstream code that supports
 usable VMs on ARM, and it doesn't seem too invasive to me.

Well, v3 of the patchset rather than this one, but yes.
Anthony -- are you going to apply it directly (at some point,
v3 has only been on list a day or so) as virtio maintainer,
or should I put it in via arm-devs?

thanks
-- PMM

[Qemu-devel] [PATCH 3/3] dataplane: enable virtio-blk x-data-plane=on live migration

2013-07-17 Thread Stefan Hajnoczi

Although the dataplane thread does not cooperate with dirty memory
logging yet it's fairly easy to temporarily disable dataplane during
live migration.  This way virtio-blk can live migrate when
x-data-plane=on.

The dataplane thread will restart after migration is cancelled or if the
guest resuming virtio-blk operation after migration completes.

Signed-off-by: Stefan Hajnoczi stefa...@redhat.com
---
 hw/block/dataplane/virtio-blk.c | 17 -
 hw/block/virtio-blk.c   | 32 
 include/hw/virtio/virtio-blk.h  |  1 +
 3 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 2faed43..411becc 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -18,7 +18,6 @@
 #include qemu/error-report.h
 #include hw/virtio/dataplane/vring.h
 #include ioq.h
-#include migration/migration.h
 #include block/block.h
 #include hw/virtio/virtio-blk.h
 #include virtio-blk.h
@@ -69,8 +68,6 @@ struct VirtIOBlockDataPlane {
  queue */
 
 unsigned int num_reqs;
-
-Error *migration_blocker;
 };
 
 /* Raise an interrupt to signal guest, if necessary */
@@ -418,6 +415,14 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, 
VirtIOBlkConf *blk,
 return false;
 }
 
+/* If dataplane is (re-)enabled while the guest is running there could be
+ * block jobs that can conflict.
+ */
+if (bdrv_in_use(blk-conf.bs)) {
+error_report(cannot start dataplane thread while device is in use);
+return false;
+}
+
 fd = raw_get_aio_fd(blk-conf.bs);
 if (fd  0) {
 error_report(drive is incompatible with x-data-plane, 
@@ -433,10 +438,6 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, 
VirtIOBlkConf *blk,
 /* Prevent block operations that conflict with data plane thread */
 bdrv_set_in_use(blk-conf.bs, 1);
 
-error_setg(s-migration_blocker,
-x-data-plane does not support migration);
-migrate_add_blocker(s-migration_blocker);
-
 *dataplane = s;
 return true;
 }
@@ -448,8 +449,6 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
 }
 
 virtio_blk_data_plane_stop(s);
-migrate_del_blocker(s-migration_blocker);
-error_free(s-migration_blocker);
 bdrv_set_in_use(s-blk-conf.bs, 0);
 g_free(s);
 }
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index cf12469..cca0c77 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -19,6 +19,7 @@
 #include hw/virtio/virtio-blk.h
 #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
 # include dataplane/virtio-blk.h
+# include migration/migration.h
 #endif
 #include block/scsi.h
 #ifdef __linux__
@@ -628,6 +629,34 @@ void virtio_blk_set_conf(DeviceState *dev, VirtIOBlkConf 
*blk)
 memcpy((s-blk), blk, sizeof(struct VirtIOBlkConf));
 }
 
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+/* Disable dataplane thread during live migration since it does not
+ * update the dirty memory bitmap yet.
+ */
+static void virtio_blk_migration_state_changed(Notifier *notifier, void *data)
+{
+VirtIOBlock *s = container_of(notifier, VirtIOBlock,
+  migration_state_notifier);
+MigrationState *mig = data;
+
+if (migration_is_active(mig)) {
+if (!s-dataplane) {
+return;
+}
+virtio_blk_data_plane_destroy(s-dataplane);
+s-dataplane = NULL;
+} else if (migration_has_finished(mig) ||
+   migration_has_failed(mig)) {
+if (s-dataplane) {
+return;
+}
+bdrv_drain_all(); /* complete in-flight non-dataplane requests */
+virtio_blk_data_plane_create(VIRTIO_DEVICE(s), s-blk,
+ s-dataplane);
+}
+}
+#endif /* CONFIG_VIRTIO_BLK_DATA_PLANE */
+
 static int virtio_blk_device_init(VirtIODevice *vdev)
 {
 DeviceState *qdev = DEVICE(vdev);
@@ -664,6 +693,8 @@ static int virtio_blk_device_init(VirtIODevice *vdev)
 virtio_cleanup(vdev);
 return -1;
 }
+s-migration_state_notifier.notify = virtio_blk_migration_state_changed;
+add_migration_state_change_notifier(s-migration_state_notifier);
 #endif
 
 s-change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
@@ -683,6 +714,7 @@ static int virtio_blk_device_exit(DeviceState *dev)
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
 VirtIOBlock *s = VIRTIO_BLK(dev);
 #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+remove_migration_state_change_notifier(s-migration_state_notifier);
 virtio_blk_data_plane_destroy(s-dataplane);
 s-dataplane = NULL;
 #endif
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index fc71853..b87cf49 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -125,6 +125,7 @@ typedef struct VirtIOBlock {
 unsigned short sector_mask;
 VMChangeStateEntry

[Qemu-devel] [PATCH V5 09/12] NUMA: add qmp command set-mem-policy to set memory policy for NUMA node

2013-07-17 Thread Wanlong Gao

This QMP command allows user set guest node's memory policy
through the QMP protocol. The qmp-shell command is like:
set-mem-policy nodeid=0 policy=membind host-nodes=0-1

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 numa.c   | 55 +++
 qapi-schema.json | 19 +++
 qmp-commands.hx  | 35 +++
 3 files changed, 109 insertions(+)

diff --git a/numa.c b/numa.c
index 24c886b..3befc56 100644
--- a/numa.c
+++ b/numa.c
@@ -29,6 +29,7 @@
 #include qapi/opts-visitor.h
 #include qapi/dealloc-visitor.h
 #include exec/memory.h
+#include qmp-commands.h
 
 #ifdef CONFIG_NUMA
 #include numa.h
@@ -378,3 +379,57 @@ void set_numa_modes(void)
 }
 }
 }
+
+void qmp_set_mem_policy(int64_t nodeid, bool has_policy, const char *policy,
+bool has_host_nodes, const char *host_nodes, Error 
**errp)
+{
+unsigned int flags;
+DECLARE_BITMAP(host_mem, MAX_CPUMASK_BITS);
+
+if (nodeid = nb_numa_nodes) {
+error_setg(errp, Only has '%d' NUMA nodes, nb_numa_nodes);
+return;
+}
+
+bitmap_copy(host_mem, numa_info[nodeid].host_mem, MAX_CPUMASK_BITS);
+flags = numa_info[nodeid].flags;
+
+numa_info[nodeid].flags = NODE_HOST_NONE;
+bitmap_zero(numa_info[nodeid].host_mem, MAX_CPUMASK_BITS);
+
+if (!has_policy) {
+if (set_node_mem_policy(nodeid) == -1) {
+error_setg(errp, Failed to set memory policy for node%lu, 
nodeid);
+goto error;
+}
+return;
+}
+
+if (numa_mem_parse_policy(nodeid, policy) == -1) {
+error_setg(errp, Failed to parse memory policy: %s, policy);
+goto error;
+}
+
+if (!has_host_nodes) {
+bitmap_fill(numa_info[nodeid].host_mem, MAX_CPUMASK_BITS);
+}
+
+if (host_nodes) {
+if (numa_mem_parse_hostnodes(nodeid, host_nodes) == -1) {
+error_setg(errp, Failed to parse host nodes range %s, 
host_nodes);
+goto error;
+}
+}
+
+if (set_node_mem_policy(nodeid) == -1) {
+error_setg(errp, Failed to set memory policy for node%lu, nodeid);
+goto error;
+}
+
+return;
+
+error:
+bitmap_copy(numa_info[nodeid].host_mem, host_mem, MAX_CPUMASK_BITS);
+numa_info[nodeid].flags = flags;
+return;
+}
diff --git a/qapi-schema.json b/qapi-schema.json
index 49eac70..fd4c615 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3729,3 +3729,22 @@
'*size':'size',
'*policy':  'str',
'*host-nodes':  'str' }}
+
+##
+# @set-mem-policy:
+#
+# Set the host memory binding policy for guest NUMA node.
+#
+# @nodeid: The node ID of guest NUMA node to set memory policy to.
+#
+# @policy: #optional The memory policy to be set (default 'default').
+#
+# @host-nodes: #optional The host nodes range for memory policy.
+#
+# Returns: Nothing on success
+#
+# Since: 1.6
+##
+{ 'command': 'set-mem-policy',
+  'data': {'nodeid': 'int', '*policy': 'str',
+   '*host-nodes': 'str'} }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index e075df4..aac88e0 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -3047,3 +3047,38 @@ Example:
 - { return: {} }
 
 EQMP
+
+{
+.name  = set-mem-policy,
+.args_type = nodeid:i,policy:s?,host-nodes:s?,
+.help  = Set the host memory binding policy for guest NUMA node,
+.mhandler.cmd_new = qmp_marshal_input_set_mem_policy,
+},
+
+SQMP
+set-mem-policy
+--
+
+Set the host memory binding policy for guest NUMA node
+
+Arguments:
+
+- nodeid: The nodeid of guest NUMA node to set memory policy to.
+(json-int)
+- policy: The memory policy string to set.
+(json-string, optional)
+- host-nodes: The host nodes contained to this memory policy.
+(json-string, optional)
+
+Example:
+
+- { execute: set-mem-policy, arguments: { nodeid: 0, policy: 
membind,
+ host-nodes: 0-1 }}
+- { return: {} }
+
+Notes:
+1. If policy is not set, the memory policy of this nodeid will be set
+   to default.
+2. If host-nodes is not set, the node mask of this policy will be set
+   to all.
+EQMP
-- 
1.8.3.2.634.g7a3187e

[Qemu-devel] [PATCH V5 12/12] NUMA: convert hmp command info_numa to use qmp command query_numa

2013-07-17 Thread Wanlong Gao

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 hmp.c | 34 ++
 hmp.h |  1 +
 monitor.c | 21 +
 3 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/hmp.c b/hmp.c
index 7798339..66c2fef 100644
--- a/hmp.c
+++ b/hmp.c
@@ -24,6 +24,7 @@
 #include ui/console.h
 #include block/qapi.h
 #include qemu-io.h
+#include sysemu/sysemu.h
 
 static void hmp_handle_error(Monitor *mon, Error **errp)
 {
@@ -1546,3 +1547,36 @@ void hmp_set_mem_policy(Monitor *mon, const QDict *qdict)
 qmp_set_mem_policy(nodeid, has_policy, policy, has_host_nodes, host_nodes, 
local_err);
 hmp_handle_error(mon, local_err);
 }
+
+void hmp_info_numa(Monitor *mon, const QDict *qdict)
+{
+NUMAInfoList *node_list, *node;
+intList *head;
+int nodeid;
+
+node_list = qmp_query_numa(NULL);
+
+monitor_printf(mon, %d nodes\n, nb_numa_nodes);
+for (node = node_list; node; node = node-next) {
+nodeid = node-value-nodeid;
+monitor_printf(mon, node %d cpus:, nodeid);
+head = node-value-cpus;
+for (head = node-value-cpus; head != NULL; head = head-next) {
+monitor_printf(mon,  %d, (int)head-value);
+}
+monitor_printf(mon, \n);
+monitor_printf(mon, node %d size: % PRId64  MB\n,
+   nodeid, node-value-memory);
+monitor_printf(mon, node %d policy: %s\n,
+   nodeid, node-value-policy ? :  );
+monitor_printf(mon, node %d relative: %s\n, nodeid,
+   node-value-relative ? true : false);
+monitor_printf(mon, node %d host-nodes:, nodeid);
+for (head = node-value-host_nodes; head != NULL; head = head-next) {
+monitor_printf(mon,  %d, (int)head-value);
+}
+monitor_printf(mon, \n);
+}
+
+qapi_free_NUMAInfoList(node_list);
+}
diff --git a/hmp.h b/hmp.h
index ae09525..56a5efd 100644
--- a/hmp.h
+++ b/hmp.h
@@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict);
 void hmp_info_pci(Monitor *mon, const QDict *qdict);
 void hmp_info_block_jobs(Monitor *mon, const QDict *qdict);
 void hmp_info_tpm(Monitor *mon, const QDict *qdict);
+void hmp_info_numa(Monitor *mon, const QDict *qdict);
 void hmp_quit(Monitor *mon, const QDict *qdict);
 void hmp_stop(Monitor *mon, const QDict *qdict);
 void hmp_system_reset(Monitor *mon, const QDict *qdict);
diff --git a/monitor.c b/monitor.c
index 566709a..87133fc 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1803,25 +1803,6 @@ static void do_info_mtree(Monitor *mon, const QDict 
*qdict)
 mtree_info((fprintf_function)monitor_printf, mon);
 }
 
-static void do_info_numa(Monitor *mon, const QDict *qdict)
-{
-int i;
-CPUState *cpu;
-
-monitor_printf(mon, %d nodes\n, nb_numa_nodes);
-for (i = 0; i  nb_numa_nodes; i++) {
-monitor_printf(mon, node %d cpus:, i);
-for (cpu = first_cpu; cpu != NULL; cpu = cpu-next_cpu) {
-if (cpu-numa_node == i) {
-monitor_printf(mon,  %d, cpu-cpu_index);
-}
-}
-monitor_printf(mon, \n);
-monitor_printf(mon, node %d size: % PRId64  MB\n, i,
-numa_info[i].node_mem  20);
-}
-}
-
 #ifdef CONFIG_PROFILER
 
 int64_t qemu_time;
@@ -2589,7 +2570,7 @@ static mon_cmd_t info_cmds[] = {
 .args_type  = ,
 .params = ,
 .help   = show NUMA information,
-.mhandler.cmd = do_info_numa,
+.mhandler.cmd = hmp_info_numa,
 },
 {
 .name   = usb,
-- 
1.8.3.2.634.g7a3187e

[Qemu-devel] [PATCH 2/3] migration: notify migration state before starting thread

2013-07-17 Thread Stefan Hajnoczi

The migration thread runs outside the QEMU global mutex when possible.
Therefore we must notify migration state change *before* starting the
migration thread.

This allows registered listeners to act before live migration iterations
begin.  Therefore they can get into a state that allows for live
migration.  When the migration thread starts everything will be ready.

Signed-off-by: Stefan Hajnoczi stefa...@redhat.com
---
 migration.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/migration.c b/migration.c
index 9f5a423..b4daf13 100644
--- a/migration.c
+++ b/migration.c
@@ -625,7 +625,9 @@ void migrate_fd_connect(MigrationState *s)
 qemu_file_set_rate_limit(s-file,
  s-bandwidth_limit / XFER_LIMIT_RATIO);
 
+/* Notify before starting migration thread */
+notifier_list_notify(migration_state_notifiers, s);
+
 qemu_thread_create(s-thread, migration_thread, s,
QEMU_THREAD_JOINABLE);
-notifier_list_notify(migration_state_notifiers, s);
 }
-- 
1.8.1.4

[Qemu-devel] [PATCH V5 10/12] NUMA: add hmp command set-mem-policy

2013-07-17 Thread Wanlong Gao

Add hmp command set-mem-policy to set host memory policy for a guest
NUMA node. Then we can also set node's memory policy using
the monitor command like:
(qemu) set-mem-policy 0 policy=membind,host-nodes=0-1

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 hmp-commands.hx | 16 
 hmp.c   | 36 
 hmp.h   |  1 +
 3 files changed, 53 insertions(+)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 8c6b91a..fe3a26f 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1587,6 +1587,22 @@ Executes a qemu-io command on the given block device.
 ETEXI
 
 {
+.name   = set-mem-policy,
+.args_type  = nodeid:i,args:s?,
+.params = nodeid [args],
+.help   = set host memory policy for a guest NUMA node,
+.mhandler.cmd = hmp_set_mem_policy,
+},
+
+STEXI
+@item set-mem-policy @var{nodeid} @var{args}
+@findex set-mem-policy
+
+Set host memory policy for a guest NUMA node
+
+ETEXI
+
+{
 .name   = info,
 .args_type  = item:s?,
 .params = [subcommand],
diff --git a/hmp.c b/hmp.c
index dc4d8d4..7798339 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1510,3 +1510,39 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict)
 
 hmp_handle_error(mon, err);
 }
+
+void hmp_set_mem_policy(Monitor *mon, const QDict *qdict)
+{
+Error *local_err = NULL;
+bool has_policy = true;
+bool has_host_nodes = true;
+const char *policy = NULL;
+const char *host_nodes = NULL;
+QemuOpts *opts;
+
+uint64_t nodeid = qdict_get_int(qdict, nodeid);
+const char *args = qdict_get_try_str(qdict, args);
+
+if (args == NULL) {
+has_policy = false;
+has_host_nodes = false;
+} else {
+opts = qemu_opts_parse(qemu_find_opts(numa), args, 1);
+if (opts == NULL) {
+monitor_printf(mon, Parsing memory policy args failed\n);
+return;
+} else {
+policy = qemu_opt_get(opts, policy);
+if (policy == NULL) {
+has_policy = false;
+}
+host_nodes = qemu_opt_get(opts, hostnode);
+if (host_nodes == NULL) {
+has_host_nodes = false;
+}
+}
+}
+
+qmp_set_mem_policy(nodeid, has_policy, policy, has_host_nodes, host_nodes, 
local_err);
+hmp_handle_error(mon, local_err);
+}
diff --git a/hmp.h b/hmp.h
index 6c3bdcd..ae09525 100644
--- a/hmp.h
+++ b/hmp.h
@@ -87,5 +87,6 @@ void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict);
 void hmp_chardev_add(Monitor *mon, const QDict *qdict);
 void hmp_chardev_remove(Monitor *mon, const QDict *qdict);
 void hmp_qemu_io(Monitor *mon, const QDict *qdict);
+void hmp_set_mem_policy(Monitor *mon, const QDict *qdict);
 
 #endif
-- 
1.8.3.2.634.g7a3187e

[Qemu-devel] [PATCH V17 4/9] qemu-option: create some QemuOpts functons

2013-07-17 Thread Dong Xu Wang

These functions will be used in next commit.

qemu_opt_get_(*)_del functions are used to make sure we
have the same behaviors as before: in block layer, after
parsing a parameter value, parameter list will delete it
to avoid parsing it twice.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com

---
v16-v17:
1) return char * instead of const char*

v13-v14:
1) rewrite commit message.
2) use def_value_str in qemu_opt_get_FOO_del() and qemu_opt_get_del().
3) delete redundant qemu_opt_del(opt).
 include/qemu/option.h |  11 -
 util/qemu-option.c| 130 --
 2 files changed, 125 insertions(+), 16 deletions(-)

diff --git a/include/qemu/option.h b/include/qemu/option.h
index 7473e2b..12fa8b4 100644
--- a/include/qemu/option.h
+++ b/include/qemu/option.h
@@ -106,6 +106,7 @@ struct QemuOptsList {
 };
 
 const char *qemu_opt_get(QemuOpts *opts, const char *name);
+char *qemu_opt_get_del(QemuOpts *opts, const char *name);
 /**
  * qemu_opt_has_help_opt:
  * @opts: options to search for a help request
@@ -119,13 +120,18 @@ const char *qemu_opt_get(QemuOpts *opts, const char 
*name);
  */
 bool qemu_opt_has_help_opt(QemuOpts *opts);
 bool qemu_opt_get_bool(QemuOpts *opts, const char *name, bool defval);
+bool qemu_opt_get_bool_del(QemuOpts *opts, const char *name, bool defval);
 uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t 
defval);
 uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval);
+uint64_t qemu_opt_get_size_del(QemuOpts *opts, const char *name,
+   uint64_t defval);
 int qemu_opt_set(QemuOpts *opts, const char *name, const char *value);
+int qemu_opt_replace_set(QemuOpts *opts, const char *name, const char *value);
 void qemu_opt_set_err(QemuOpts *opts, const char *name, const char *value,
   Error **errp);
 int qemu_opt_set_bool(QemuOpts *opts, const char *name, bool val);
 int qemu_opt_set_number(QemuOpts *opts, const char *name, int64_t val);
+int qemu_opt_replace_set_number(QemuOpts *opts, const char *name, int64_t val);
 typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void 
*opaque);
 int qemu_opt_foreach(QemuOpts *opts, qemu_opt_loopfunc func, void *opaque,
  int abort_on_failure);
@@ -142,7 +148,10 @@ const char *qemu_opts_id(QemuOpts *opts);
 void qemu_opts_del(QemuOpts *opts);
 void qemu_opts_validate(QemuOpts *opts, const QemuOptDesc *desc, Error **errp);
 int qemu_opts_do_parse(QemuOpts *opts, const char *params, const char 
*firstname);
-QemuOpts *qemu_opts_parse(QemuOptsList *list, const char *params, int 
permit_abbrev);
+int qemu_opts_do_parse_replace(QemuOpts *opts, const char *params,
+   const char *firstname);
+QemuOpts *qemu_opts_parse(QemuOptsList *list, const char *params,
+  int permit_abbrev);
 void qemu_opts_set_defaults(QemuOptsList *list, const char *params,
 int permit_abbrev);
 QemuOpts *qemu_opts_from_qdict(QemuOptsList *list, const QDict *qdict,
diff --git a/util/qemu-option.c b/util/qemu-option.c
index 3a055a7..41f5b0f 100644
--- a/util/qemu-option.c
+++ b/util/qemu-option.c
@@ -523,6 +523,32 @@ const char *qemu_opt_get(QemuOpts *opts, const char *name)
 return opt ? opt-str : NULL;
 }
 
+static void qemu_opt_del(QemuOpt *opt)
+{
+QTAILQ_REMOVE(opt-opts-head, opt, next);
+g_free((/* !const */ char*)opt-name);
+g_free((/* !const */ char*)opt-str);
+g_free(opt);
+}
+
+char *qemu_opt_get_del(QemuOpts *opts, const char *name)
+{
+QemuOpt *opt = qemu_opt_find(opts, name);
+const QemuOptDesc *desc;
+char *str = NULL;
+
+if (!opt) {
+desc = find_desc_by_name(opts-list-desc, name);
+if (desc  desc-def_value_str) {
+str = g_strdup(desc-def_value_str);
+}
+} else {
+str = g_strdup(opt-str);
+qemu_opt_del(opt);
+}
+return str;
+}
+
 bool qemu_opt_has_help_opt(QemuOpts *opts)
 {
 QemuOpt *opt;
@@ -553,6 +579,27 @@ bool qemu_opt_get_bool(QemuOpts *opts, const char *name, 
bool defval)
 return opt-value.boolean;
 }
 
+bool qemu_opt_get_bool_del(QemuOpts *opts, const char *name, bool defval)
+{
+QemuOpt *opt = qemu_opt_find(opts, name);
+const QemuOptDesc *desc;
+Error *local_err = NULL;
+bool ret = defval;
+
+if (opt == NULL) {
+desc = find_desc_by_name(opts-list-desc, name);
+if (desc  desc-def_value_str) {
+parse_option_bool(name, desc-def_value_str, ret, local_err);
+assert(!local_err);
+}
+return ret;
+}
+assert(opt-desc  opt-desc-type == QEMU_OPT_BOOL);
+ret = opt-value.boolean;
+qemu_opt_del(opt);
+return ret;
+}
+
 uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
 {
 QemuOpt *opt = qemu_opt_find(opts, name);
@@ -589,6 +636,28 @@ uint64_t qemu_opt_get_size(QemuOpts *opts, const

[Qemu-devel] [PATCH V17 7/9] qemu-option: remove QEMUOptionParameter related functions and struct

2013-07-17 Thread Dong Xu Wang

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 include/qemu/option.h |  39 ---
 util/qemu-option.c| 285 --
 2 files changed, 324 deletions(-)

diff --git a/include/qemu/option.h b/include/qemu/option.h
index 12fa8b4..2aaa057 100644
--- a/include/qemu/option.h
+++ b/include/qemu/option.h
@@ -31,24 +31,6 @@
 #include qapi/error.h
 #include qapi/qmp/qdict.h
 
-enum QEMUOptionParType {
-OPT_FLAG,
-OPT_NUMBER,
-OPT_SIZE,
-OPT_STRING,
-};
-
-typedef struct QEMUOptionParameter {
-const char *name;
-enum QEMUOptionParType type;
-union {
-uint64_t n;
-char* s;
-} value;
-const char *help;
-} QEMUOptionParameter;
-
-
 const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
 const char *get_opt_value(char *buf, int buf_size, const char *p);
 int get_next_param_value(char *buf, int buf_size,
@@ -56,27 +38,6 @@ int get_next_param_value(char *buf, int buf_size,
 int get_param_value(char *buf, int buf_size,
 const char *tag, const char *str);
 
-
-/*
- * The following functions take a parameter list as input. This is a pointer to
- * the first element of a QEMUOptionParameter array which is terminated by an
- * entry with entry-name == NULL.
- */
-
-QEMUOptionParameter *get_option_parameter(QEMUOptionParameter *list,
-const char *name);
-int set_option_parameter(QEMUOptionParameter *list, const char *name,
-const char *value);
-int set_option_parameter_int(QEMUOptionParameter *list, const char *name,
-uint64_t value);
-QEMUOptionParameter *append_option_parameters(QEMUOptionParameter *dest,
-QEMUOptionParameter *list);
-QEMUOptionParameter *parse_option_parameters(const char *param,
-QEMUOptionParameter *list, QEMUOptionParameter *dest);
-void free_option_parameters(QEMUOptionParameter *list);
-void print_option_parameters(QEMUOptionParameter *list);
-void print_option_help(QEMUOptionParameter *list);
-
 /* -- */
 
 typedef struct QemuOpt QemuOpt;
diff --git a/util/qemu-option.c b/util/qemu-option.c
index 41f5b0f..7545486 100644
--- a/util/qemu-option.c
+++ b/util/qemu-option.c
@@ -123,22 +123,6 @@ int get_param_value(char *buf, int buf_size,
 return get_next_param_value(buf, buf_size, tag, str);
 }
 
-/*
- * Searches an option list for an option with the given name
- */
-QEMUOptionParameter *get_option_parameter(QEMUOptionParameter *list,
-const char *name)
-{
-while (list  list-name) {
-if (!strcmp(list-name, name)) {
-return list;
-}
-list++;
-}
-
-return NULL;
-}
-
 static void parse_option_bool(const char *name, const char *value, bool *ret,
   Error **errp)
 {
@@ -212,275 +196,6 @@ static void parse_option_size(const char *name, const 
char *value,
 }
 }
 
-/*
- * Sets the value of a parameter in a given option list. The parsing of the
- * value depends on the type of option:
- *
- * OPT_FLAG (uses value.n):
- *  If no value is given, the flag is set to 1.
- *  Otherwise the value must be on (set to 1) or off (set to 0)
- *
- * OPT_STRING (uses value.s):
- *  value is strdup()ed and assigned as option value
- *
- * OPT_SIZE (uses value.n):
- *  The value is converted to an integer. Suffixes for kilobytes etc. are
- *  allowed (powers of 1024).
- *
- * Returns 0 on succes, -1 in error cases
- */
-int set_option_parameter(QEMUOptionParameter *list, const char *name,
-const char *value)
-{
-bool flag;
-Error *local_err = NULL;
-
-// Find a matching parameter
-list = get_option_parameter(list, name);
-if (list == NULL) {
-fprintf(stderr, Unknown option '%s'\n, name);
-return -1;
-}
-
-// Process parameter
-switch (list-type) {
-case OPT_FLAG:
-parse_option_bool(name, value, flag, local_err);
-if (!error_is_set(local_err)) {
-list-value.n = flag;
-}
-break;
-
-case OPT_STRING:
-if (value != NULL) {
-list-value.s = g_strdup(value);
-} else {
-fprintf(stderr, Option '%s' needs a parameter\n, name);
-return -1;
-}
-break;
-
-case OPT_SIZE:
-parse_option_size(name, value, list-value.n, local_err);
-break;
-
-default:
-fprintf(stderr, Bug: Option '%s' has an unknown type\n, name);
-return -1;
-}
-
-if (error_is_set(local_err)) {
-qerror_report_err(local_err);
-error_free(local_err);
-return -1;
-}
-
-return 0;
-}
-
-/*
- * Sets the given parameter to an integer instead of a string.
- * This function cannot be used to set string options.
- *
- * Returns 0 on success, -1 in error cases
- */
-int set_option_parameter_int(QEMUOptionParameter *list, const char *name,
-uint64_t value)
-{
-

[Qemu-devel] [PATCH 0/3] dataplane: virtio-blk live migration with x-data-plane=on

2013-07-17 Thread Stefan Hajnoczi

These patches add live migration support to -device 
virtio-blk-pci,x-data-plane=on.

Patch 1 has already been posted and merged into the block tree.  I have
included it for convenience.

Patches 2  3 implement a switch from dataplane mode back to regular virtio-blk
mode when migration starts.  This way live migration works.

If migration is cancelled or the guest accesses the virtio-blk device after
completion, dataplane starts again.

Since this approach is so small, it's more palatable for QEMU 1.6 than trying
to make vring.c log dirty memory.  It makes dataplane usable in situations
where live migration is a requirement.

Stefan Hajnoczi (3):
  dataplane: sync virtio.c and vring.c virtqueue state
  migration: notify migration state before starting thread
  dataplane: enable virtio-blk x-data-plane=on live migration

 hw/block/dataplane/virtio-blk.c | 19 +--
 hw/block/virtio-blk.c   | 32 
 hw/virtio/dataplane/vring.c |  8 +---
 include/hw/virtio/dataplane/vring.h |  2 +-
 include/hw/virtio/virtio-blk.h  |  1 +
 migration.c |  4 +++-
 6 files changed, 51 insertions(+), 15 deletions(-)

-- 
1.8.1.4

[Qemu-devel] [PATCH V17 2/9] qemu-option: avoid duplication of default value in QemuOpts

2013-07-17 Thread Dong Xu Wang

This patch moves the default value entirely to QemuOptDesc.

When getting the value of an option that hasn't been set, and
QemuOptDesc has a default value, return that.  Else, behave as
before.

Example: qemu_opt_get_number(opts, foo, 42)

   If foo has been set in opts, return its value.

   Else, if opt's QemuOptDesc has a default value for foo, return
   that.

   Else, return 42.

   Note that the last argument is useless when QemuOptDesc has a
   default value.  Ugly.  If it bothers us, we could assert.

Example: qemu_opt_get(opts, bar)

   If bar has been set in opts, return its value.

   Else, if opt's QemuOptDesc has a default value for bar, return
   that.

   Else, return NULL.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
Reviewed-by: Eric Blake ebl...@redhat.com


Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
v13-v14:
1) change code style.
2) assert errors.

 util/qemu-option.c | 66 --
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/util/qemu-option.c b/util/qemu-option.c
index be5dfaa..86cc64e 100644
--- a/util/qemu-option.c
+++ b/util/qemu-option.c
@@ -495,9 +495,31 @@ static QemuOpt *qemu_opt_find(QemuOpts *opts, const char 
*name)
 return NULL;
 }
 
+static const QemuOptDesc *find_desc_by_name(const QemuOptDesc *desc,
+const char *name)
+{
+int i;
+
+for (i = 0; desc[i].name != NULL; i++) {
+if (strcmp(desc[i].name, name) == 0) {
+return desc[i];
+}
+}
+
+return NULL;
+}
+
 const char *qemu_opt_get(QemuOpts *opts, const char *name)
 {
 QemuOpt *opt = qemu_opt_find(opts, name);
+const QemuOptDesc *desc;
+
+if (!opt) {
+desc = find_desc_by_name(opts-list-desc, name);
+if (desc  desc-def_value_str) {
+return desc-def_value_str;
+}
+}
 return opt ? opt-str : NULL;
 }
 
@@ -516,9 +538,17 @@ bool qemu_opt_has_help_opt(QemuOpts *opts)
 bool qemu_opt_get_bool(QemuOpts *opts, const char *name, bool defval)
 {
 QemuOpt *opt = qemu_opt_find(opts, name);
+const QemuOptDesc *desc;
+Error *local_err = NULL;
 
-if (opt == NULL)
+if (opt == NULL) {
+desc = find_desc_by_name(opts-list-desc, name);
+if (desc  desc-def_value_str) {
+parse_option_bool(name, desc-def_value_str, defval, local_err);
+assert(!local_err);
+}
 return defval;
+}
 assert(opt-desc  opt-desc-type == QEMU_OPT_BOOL);
 return opt-value.boolean;
 }
@@ -526,9 +556,17 @@ bool qemu_opt_get_bool(QemuOpts *opts, const char *name, 
bool defval)
 uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
 {
 QemuOpt *opt = qemu_opt_find(opts, name);
+const QemuOptDesc *desc;
+Error *local_err = NULL;
 
-if (opt == NULL)
+if (opt == NULL) {
+desc = find_desc_by_name(opts-list-desc, name);
+if (desc  desc-def_value_str) {
+parse_option_number(name, desc-def_value_str, defval, 
local_err);
+assert(!local_err);
+}
 return defval;
+}
 assert(opt-desc  opt-desc-type == QEMU_OPT_NUMBER);
 return opt-value.uint;
 }
@@ -536,9 +574,17 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char 
*name, uint64_t defval)
 uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
 {
 QemuOpt *opt = qemu_opt_find(opts, name);
+const QemuOptDesc *desc;
+Error *local_err = NULL;
 
-if (opt == NULL)
+if (opt == NULL) {
+desc = find_desc_by_name(opts-list-desc, name);
+if (desc  desc-def_value_str) {
+parse_option_size(name, desc-def_value_str, defval, local_err);
+assert(!local_err);
+}
 return defval;
+}
 assert(opt-desc  opt-desc-type == QEMU_OPT_SIZE);
 return opt-value.uint;
 }
@@ -579,20 +625,6 @@ static bool opts_accepts_any(const QemuOpts *opts)
 return opts-list-desc[0].name == NULL;
 }
 
-static const QemuOptDesc *find_desc_by_name(const QemuOptDesc *desc,
-const char *name)
-{
-int i;
-
-for (i = 0; desc[i].name != NULL; i++) {
-if (strcmp(desc[i].name, name) == 0) {
-return desc[i];
-}
-}
-
-return NULL;
-}
-
 static void opt_set(QemuOpts *opts, const char *name, const char *value,
 bool prepend, Error **errp)
 {
-- 
1.7.11.7

[Qemu-devel] [PATCH v2 01/11] block: replace in_use with refcnt_soft and refcnt_hard

2013-07-17 Thread Fam Zheng

Introduce refcnt_soft (soft reference) and refcnt_hard (hard reference)
to BlockDriverState, since in_use mechanism cannot provide proper
management of lifecycle when a BDS is referenced in multiple places
(e.g. pointed to by another bs's backing_hd while also used as a block
job device, in the use case of image fleecing).

The original in_use case is considered a hard reference in this patch,
where the bs is busy and should not be used in other tasks that require
a hard reference. (However the interface doesn't force this, caller
still need to call bdrv_in_use() to check by itself.).

A soft reference is implemented but not used yet. It will be used in
following patches to manage the lifecycle together with hard reference.

If bdrv_ref() is called on a BDS, it must be released by exactly the
same numbers of bdrv_unref() with the same soft/hard type, and never
call bdrv_delete() directly. If the BDS is only used locally (unnamed),
bdrv_ref/bdrv_unref can be skipped and just use bdrv_delete().

Signed-off-by: Fam Zheng f...@redhat.com
---
 block-migration.c   |  4 ++--
 block.c | 40 +++-
 blockjob.c  |  6 +++---
 hw/block/dataplane/virtio-blk.c |  4 ++--
 include/block/block.h   |  5 +++--
 include/block/block_int.h   |  3 ++-
 6 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 2fd7699..d558410 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -321,7 +321,7 @@ static void init_blk_migration_it(void *opaque, 
BlockDriverState *bs)
 bmds-shared_base = block_mig_state.shared_base;
 alloc_aio_bitmap(bmds);
 drive_get_ref(drive_get_by_blockdev(bs));
-bdrv_set_in_use(bs, 1);
+bdrv_ref(bs, true);
 
 block_mig_state.total_sector_sum += sectors;
 
@@ -557,7 +557,7 @@ static void blk_mig_cleanup(void)
 blk_mig_lock();
 while ((bmds = QSIMPLEQ_FIRST(block_mig_state.bmds_list)) != NULL) {
 QSIMPLEQ_REMOVE_HEAD(block_mig_state.bmds_list, entry);
-bdrv_set_in_use(bmds-bs, 0);
+bdrv_unref(bmds-bs, true);
 drive_put_ref(drive_get_by_blockdev(bmds-bs));
 g_free(bmds-aio_bitmap);
 g_free(bmds);
diff --git a/block.c b/block.c
index b560241..4170ff6 100644
--- a/block.c
+++ b/block.c
@@ -1511,8 +1511,11 @@ static void bdrv_move_feature_fields(BlockDriverState 
*bs_dest,
 /* dirty bitmap */
 bs_dest-dirty_bitmap   = bs_src-dirty_bitmap;
 
+/* reference count */
+bs_dest-refcnt_soft= bs_src-refcnt_soft;
+bs_dest-refcnt_hard= bs_src-refcnt_hard;
+
 /* job */
-bs_dest-in_use = bs_src-in_use;
 bs_dest-job= bs_src-job;
 
 /* keep the same entry in bdrv_states */
@@ -1542,7 +1545,6 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState 
*bs_old)
 assert(bs_new-dirty_bitmap == NULL);
 assert(bs_new-job == NULL);
 assert(bs_new-dev == NULL);
-assert(bs_new-in_use == 0);
 assert(bs_new-io_limits_enabled == false);
 assert(bs_new-block_timer == NULL);
 
@@ -1561,7 +1563,6 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState 
*bs_old)
 /* Check a few fields that should remain attached to the device */
 assert(bs_new-dev == NULL);
 assert(bs_new-job == NULL);
-assert(bs_new-in_use == 0);
 assert(bs_new-io_limits_enabled == false);
 assert(bs_new-block_timer == NULL);
 
@@ -1598,7 +1599,6 @@ void bdrv_delete(BlockDriverState *bs)
 {
 assert(!bs-dev);
 assert(!bs-job);
-assert(!bs-in_use);
 
 /* remove from list, if necessary */
 bdrv_make_anon(bs);
@@ -4374,15 +4374,37 @@ int64_t bdrv_get_dirty_count(BlockDriverState *bs)
 }
 }
 
-void bdrv_set_in_use(BlockDriverState *bs, int in_use)
+/* Get a soft or hard reference to bs */
+void bdrv_ref(BlockDriverState *bs, bool is_hard)
+{
+if (is_hard) {
+bs-refcnt_hard++;
+} else {
+bs-refcnt_soft++;
+}
+}
+
+/* Release a previously grabbed reference to bs, need to specify if it is hard
+ * or soft. If after this releasing, both soft and hard reference counts are
+ * zero, the BlockDriverState is deleted. */
+void bdrv_unref(BlockDriverState *bs, bool is_hard)
 {
-assert(bs-in_use != in_use);
-bs-in_use = in_use;
+if (is_hard) {
+assert(bs-refcnt_hard  0);
+bs-refcnt_hard--;
+} else {
+assert(bs-refcnt_soft  0);
+bs-refcnt_soft--;
+}
+if (bs-refcnt_hard == 0  bs-refcnt_soft == 0) {
+bdrv_close(bs);
+bdrv_delete(bs);
+}
 }
 
-int bdrv_in_use(BlockDriverState *bs)
+bool bdrv_in_use(BlockDriverState *bs)
 {
-return bs-in_use;
+return bs-refcnt_hard  0;
 }
 
 void bdrv_iostatus_enable(BlockDriverState *bs)
diff --git a/blockjob.c b/blockjob.c
index ca80df1..24f07f9 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -45,7 +45,7 @@ void *block_job_create(const

[Qemu-devel] [PATCH V17 6/9] qapi: query-command-line-options outputs def_value_str

2013-07-17 Thread Dong Xu Wang

QMP command query-command-line-options shows details information of
parameters, since added def_value_str, also output it in the QMP
command.

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
v16-v17:
1) add Since 1.6 tag.
2) rename def_str_value to default.

 qapi-schema.json   | 8 ++--
 qmp-commands.hx| 2 ++
 util/qemu-config.c | 4 
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index 7b9fef1..cb9098c 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3612,12 +3612,16 @@
 #
 # @help: #optional human readable text string, not suitable for parsing.
 #
-# Since 1.5
+# @default: #optional string representation of the default used
+#   if the option is omitted.
+#
+# Since 1.6
 ##
 { 'type': 'CommandLineParameterInfo',
   'data': { 'name': 'str',
 'type': 'CommandLineParameterType',
-'*help': 'str' } }
+'*help': 'str',
+'*default': 'str'} }
 
 ##
 # @CommandLineOptionInfo:
diff --git a/qmp-commands.hx b/qmp-commands.hx
index e075df4..a88f310 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2571,6 +2571,8 @@ Each array entry contains the following:
   or 'size')
 - help: human readable description of the parameter
   (json-string, optional)
+- default: default value string for the parameter
+ (json-string, optional)
 
 Example:
 
diff --git a/util/qemu-config.c b/util/qemu-config.c
index a59568d..315ecbf 100644
--- a/util/qemu-config.c
+++ b/util/qemu-config.c
@@ -67,6 +67,10 @@ static CommandLineParameterInfoList 
*query_option_descs(const QemuOptDesc *desc)
 info-has_help = true;
 info-help = g_strdup(desc[i].help);
 }
+if (desc[i].def_value_str) {
+info-has_q_default = true;
+info-q_default = g_strdup(desc[i].def_value_str);
+}
 
 entry = g_malloc0(sizeof(*entry));
 entry-value = info;
-- 
1.7.11.7

[Qemu-devel] [PATCH v2 02/11] block: use refcnt for bs-backing_hd and bs-file

2013-07-17 Thread Fam Zheng

Signed-off-by: Fam Zheng f...@redhat.com
---
 block.c   | 18 ++
 block/blkdebug.c  |  1 +
 block/blkverify.c |  1 +
 block/snapshot.c  |  2 +-
 block/stream.c|  2 +-
 block/vvfat.c |  1 +
 6 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 4170ff6..7b46669 100644
--- a/block.c
+++ b/block.c
@@ -932,6 +932,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options)
 bs-open_flags |= BDRV_O_NO_BACKING;
 return ret;
 }
+bdrv_ref(bs-backing_hd, false);
 return 0;
 }
 
@@ -1075,9 +1076,11 @@ int bdrv_open(BlockDriverState *bs, const char 
*filename, QDict *options,
 
 if (bs-file != file) {
 bdrv_delete(file);
-file = NULL;
 }
 
+file = NULL;
+bdrv_ref(bs-file, false);
+
 /* If there is a backing file, use it */
 if ((flags  BDRV_O_NO_BACKING) == 0) {
 QDict *backing_options;
@@ -1375,7 +1378,7 @@ void bdrv_close(BlockDriverState *bs)
 
 if (bs-drv) {
 if (bs-backing_hd) {
-bdrv_delete(bs-backing_hd);
+bdrv_unref(bs-backing_hd, false);
 bs-backing_hd = NULL;
 }
 bs-drv-bdrv_close(bs);
@@ -1399,7 +1402,7 @@ void bdrv_close(BlockDriverState *bs)
 bs-options = NULL;
 
 if (bs-file != NULL) {
-bdrv_delete(bs-file);
+bdrv_unref(bs-file, false);
 bs-file = NULL;
 }
 }
@@ -1587,7 +1590,11 @@ void bdrv_append(BlockDriverState *bs_new, 
BlockDriverState *bs_top)
 
 /* The contents of 'tmp' will become bs_top, as we are
  * swapping bs_new and bs_top contents. */
+if (bs_top-backing_hd) {
+bdrv_unref(bs_top-backing_hd, false);
+}
 bs_top-backing_hd = bs_new;
+bdrv_ref(bs_new, false);
 bs_top-open_flags = ~BDRV_O_NO_BACKING;
 pstrcpy(bs_top-backing_file, sizeof(bs_top-backing_file),
 bs_new-filename);
@@ -2108,8 +2115,11 @@ int bdrv_drop_intermediate(BlockDriverState *active, 
BlockDriverState *top,
 if (ret) {
 goto exit;
 }
+if (new_top_bs-backing_hd) {
+bdrv_unref(new_top_bs-backing_hd, false);
+}
 new_top_bs-backing_hd = base_bs;
-
+bdrv_ref(base_bs, false);
 
 QSIMPLEQ_FOREACH_SAFE(intermediate_state, states_to_delete, entry, next) {
 /* so that bdrv_close() does not recursively close the chain */
diff --git a/block/blkdebug.c b/block/blkdebug.c
index ccb627a..c4ddbac 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -389,6 +389,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags)
 if (ret  0) {
 goto fail;
 }
+bdrv_ref(bs-file, false);
 
 ret = 0;
 fail:
diff --git a/block/blkverify.c b/block/blkverify.c
index 1d58cc3..5996045 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -144,6 +144,7 @@ static int blkverify_open(BlockDriverState *bs, QDict 
*options, int flags)
 if (ret  0) {
 goto fail;
 }
+bdrv_ref(bs-file, false);
 
 /* Open the test file */
 filename = qemu_opt_get(opts, x-image);
diff --git a/block/snapshot.c b/block/snapshot.c
index 6c6d9de..4f145fa 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -99,7 +99,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
 ret = bdrv_snapshot_goto(bs-file, snapshot_id);
 open_ret = drv-bdrv_open(bs, NULL, bs-open_flags);
 if (open_ret  0) {
-bdrv_delete(bs-file);
+bdrv_unref(bs-file, false);
 bs-drv = NULL;
 return open_ret;
 }
diff --git a/block/stream.c b/block/stream.c
index 7fe9e48..bcc8dc6 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -68,7 +68,7 @@ static void close_unused_images(BlockDriverState *top, 
BlockDriverState *base,
 unused = intermediate;
 intermediate = intermediate-backing_hd;
 unused-backing_hd = NULL;
-bdrv_delete(unused);
+bdrv_unref(unused, false);
 }
 top-backing_hd = base;
 }
diff --git a/block/vvfat.c b/block/vvfat.c
index 87b0279..353d4ac 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2949,6 +2949,7 @@ static int enable_write_target(BDRVVVFATState *s)
 s-bs-backing_hd-drv = vvfat_write_target;
 s-bs-backing_hd-opaque = g_malloc(sizeof(void*));
 *(void**)s-bs-backing_hd-opaque = s;
+bdrv_ref(s-bs-backing_hd, false);
 
 return 0;
 }
-- 
1.8.3.2

[Qemu-devel] [PATCH v2 05/11] migration: omit drive ref as we have bdrv_ref now

2013-07-17 Thread Fam Zheng

Signed-off-by: Fam Zheng f...@redhat.com
---
 block-migration.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index d558410..d14f4eb 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -320,7 +320,6 @@ static void init_blk_migration_it(void *opaque, 
BlockDriverState *bs)
 bmds-completed_sectors = 0;
 bmds-shared_base = block_mig_state.shared_base;
 alloc_aio_bitmap(bmds);
-drive_get_ref(drive_get_by_blockdev(bs));
 bdrv_ref(bs, true);
 
 block_mig_state.total_sector_sum += sectors;
@@ -558,7 +557,6 @@ static void blk_mig_cleanup(void)
 while ((bmds = QSIMPLEQ_FIRST(block_mig_state.bmds_list)) != NULL) {
 QSIMPLEQ_REMOVE_HEAD(block_mig_state.bmds_list, entry);
 bdrv_unref(bmds-bs, true);
-drive_put_ref(drive_get_by_blockdev(bmds-bs));
 g_free(bmds-aio_bitmap);
 g_free(bmds);
 }
-- 
1.8.3.2

[Qemu-devel] [PATCH v2 04/11] block: use refcnt for device attach/detach

2013-07-17 Thread Fam Zheng

Signed-off-by: Fam Zheng f...@redhat.com
---
 block.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block.c b/block.c
index 7b46669..57a3876 100644
--- a/block.c
+++ b/block.c
@@ -1622,6 +1622,7 @@ int bdrv_attach_dev(BlockDriverState *bs, void *dev)
 return -EBUSY;
 }
 bs-dev = dev;
+bdrv_ref(bs, false);
 bdrv_iostatus_reset(bs);
 return 0;
 }
@@ -1639,6 +1640,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev)
 {
 assert(bs-dev == dev);
 bs-dev = NULL;
+bdrv_unref(bs, false);
 bs-dev_ops = NULL;
 bs-dev_opaque = NULL;
 bs-buffer_alignment = 512;
-- 
1.8.3.2

[Qemu-devel] [PATCH v2 06/11] xen_disk: simplify blk_disconnect with refcnt

2013-07-17 Thread Fam Zheng

We call bdrv_attach_dev when initializing whether or not bs is created
locally, so call bdrv_detach_dev and let the refcnt handle the
lifecycle.

Signed-off-by: Fam Zheng f...@redhat.com
---
 hw/block/xen_disk.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 247f32f..ae17acc 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -910,12 +910,7 @@ static void blk_disconnect(struct XenDevice *xendev)
 struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 
 if (blkdev-bs) {
-if (!blkdev-dinfo) {
-/* close/delete only if we created it ourself */
-bdrv_close(blkdev-bs);
-bdrv_detach_dev(blkdev-bs, blkdev);
-bdrv_delete(blkdev-bs);
-}
+bdrv_detach_dev(blkdev-bs, blkdev);
 blkdev-bs = NULL;
 }
 xen_be_unbind_evtchn(blkdev-xendev);
-- 
1.8.3.2

[Qemu-devel] [PATCH v2 00/11] Point-in-time snapshot exporting over NBD

2013-07-17 Thread Fam Zheng

This series adds for point-in-time snapshot NBD exporting based on
blockdev-backup (variant of drive-backup with existing device as target). This
patch is built on top of imain's sync mode patches for drive-backup.

We get a thin point-in-time snapshot by COW mechanism of drive-backup, and
export it through built in NBD server. The steps are as below:

 1. (SHELL) qemu-img create -f qcow2 BACKUP.qcow2 source size here

(Alternatively we can use -o backing_file=RUNNING-VM.img to omit explicitly
providing the size by ourselves, but it's risky because RUNNING-VM.qcow2 is
used r/w by guest. Whether or not setting backing file in the image file
doesn't matter, as we are going to override the backing hd in the next
step)

 2. (HMP) drive_add backing=ide0-hd0,file=BACKUP.qcow2,id=target0,if=none

(where ide0-hd0 is the running BlockDriverState name for
RUNNING-VM.img. This patch implements backing= option to override
backing_hd for added drive)

 3. (QMP) blockdev-backup device=ide0-hd0 sync=none target=target0

(this is the QMP command introduced by this series, which use a named
device as target of drive-backup)

 4. (QMP) nbd-server-add device=target0

When image fleecing done:

 1. (QMP) block-job-complete device=ide0-hd0

 2. (HMP) drive_del target0

 3. (SHELL) rm BACKUP.qcow2

v2:
* Introduce soft and hard reference count, compared to a single type of ref 
count in v1.
* Add backing= option to drive_add.
* Introduce QMP command blockdev-backup

Fam Zheng (11):
  block: replace in_use with refcnt_soft and refcnt_hard
  block: use refcnt for bs-backing_hd and bs-file
  block: use refcnt for drive_init/drive_uninit
  block: use refcnt for device attach/detach
  migration: omit drive ref as we have bdrv_ref now
  xen_disk: simplify blk_disconnect with refcnt
  block: hold hard reference for backup/mirror target
  block: simplify bdrv_drop_intermediate
  block: add assertion to check refcount before deleting
  block: add option 'backing' to -drive options
  qmp: add command 'blockdev-backup'

 block-migration.c   |   6 +-
 block.c | 155 +---
 block/backup.c  |   3 +-
 block/blkdebug.c|   1 +
 block/blkverify.c   |   1 +
 block/mirror.c  |   4 +-
 block/snapshot.c|   2 +-
 block/stream.c  |   2 +-
 block/vvfat.c   |   1 +
 blockdev.c  |  77 +++-
 blockjob.c  |   6 +-
 hw/block/dataplane/virtio-blk.c |   4 +-
 hw/block/xen_disk.c |   7 +-
 include/block/block.h   |   5 +-
 include/block/block_int.h   |   3 +-
 qapi-schema.json|  49 +
 qmp-commands.hx |  22 ++
 17 files changed, 248 insertions(+), 100 deletions(-)

-- 
1.8.3.2

[Qemu-devel] [PATCH v2 10/11] block: add option 'backing' to -drive options

2013-07-17 Thread Fam Zheng

This option allows overriding backing hd of drive. If the target drive
exists, it's referenced as the backing file and refcount incremented.

Example:
qemu-system-x86_64 -drive \
file.filename=foo.qcow2,if=none,id=foo \
-drive file=bar.qcow2,backing=foo

Signed-off-by: Fam Zheng f...@redhat.com
---
 block.c | 30 --
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 147a448..31fab07 100644
--- a/block.c
+++ b/block.c
@@ -1083,12 +1083,30 @@ int bdrv_open(BlockDriverState *bs, const char 
*filename, QDict *options,
 
 /* If there is a backing file, use it */
 if ((flags  BDRV_O_NO_BACKING) == 0) {
-QDict *backing_options;
-
-extract_subqdict(options, backing_options, backing.);
-ret = bdrv_open_backing_file(bs, backing_options);
-if (ret  0) {
-goto close_and_fail;
+const char *backing_drive;
+backing_drive = qdict_get_try_str(options, backing);
+if (backing_drive) {
+bs-backing_hd = bdrv_find(backing_drive);
+if (bs-backing_hd) {
+bdrv_ref(bs-backing_hd, false);
+pstrcpy(bs-backing_file, sizeof(bs-backing_file),
+bs-backing_hd-filename);
+pstrcpy(bs-backing_format, sizeof(bs-backing_format),
+bs-backing_hd-drv-format_name);
+} else {
+qerror_report(ERROR_CLASS_DEVICE_NOT_FOUND,
+Can't find drive with name %s\n, backing_drive);
+ret = -EINVAL;
+goto close_and_fail;
+}
+qdict_del(options, backing);
+} else {
+QDict *backing_options;
+extract_subqdict(options, backing_options, backing.);
+ret = bdrv_open_backing_file(bs, backing_options);
+if (ret  0) {
+goto close_and_fail;
+}
 }
 }
 
-- 
1.8.3.2

[Qemu-devel] [PATCH v2 03/11] block: use refcnt for drive_init/drive_uninit

2013-07-17 Thread Fam Zheng

Signed-off-by: Fam Zheng f...@redhat.com
---
 blockdev.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index c5abd65..bb986a1 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -212,7 +212,7 @@ static void bdrv_format_print(void *opaque, const char 
*name)
 static void drive_uninit(DriveInfo *dinfo)
 {
 qemu_opts_del(dinfo-opts);
-bdrv_delete(dinfo-bdrv);
+bdrv_unref(dinfo-bdrv, false);
 g_free(dinfo-id);
 QTAILQ_REMOVE(drives, dinfo, next);
 g_free(dinfo-serial);
@@ -709,8 +709,10 @@ DriveInfo *drive_init(QemuOpts *all_opts, 
BlockInterfaceType block_default_type)
 goto err;
 }
 
-if (bdrv_key_required(dinfo-bdrv))
+bdrv_ref(dinfo-bdrv, false);
+if (bdrv_key_required(dinfo-bdrv)) {
 autostart = 0;
+}
 
 qemu_opts_del(opts);
 
-- 
1.8.3.2

[Qemu-devel] [PATCH v2 09/11] block: add assertion to check refcount before deleting

2013-07-17 Thread Fam Zheng

If BDS is managed with refcnt, we should not call bdrv_delete()
directly, instead bdrv_unref() should be used. Adding assertion to
ensure this.

Signed-off-by: Fam Zheng f...@redhat.com
---
 block.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block.c b/block.c
index 499de22..147a448 100644
--- a/block.c
+++ b/block.c
@@ -1606,6 +1606,8 @@ void bdrv_delete(BlockDriverState *bs)
 {
 assert(!bs-dev);
 assert(!bs-job);
+assert(!bs-refcnt_soft);
+assert(!bs-refcnt_hard);
 
 /* remove from list, if necessary */
 bdrv_make_anon(bs);
-- 
1.8.3.2

[Qemu-devel] [PATCH 1/3] dataplane: sync virtio.c and vring.c virtqueue state

2013-07-17 Thread Stefan Hajnoczi

Load the virtio.c state into vring.c when we start dataplane mode and
vice versa when stopping dataplane mode.  This patch makes it possible
to start and stop dataplane any time while the guest is running.

This is very useful since it will allow us to go back to QEMU main loop
for bdrv_drain_all() and live migration.

Signed-off-by: Stefan Hajnoczi stefa...@redhat.com
---
 hw/block/dataplane/virtio-blk.c | 2 +-
 hw/virtio/dataplane/vring.c | 8 +---
 include/hw/virtio/dataplane/vring.h | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 0356665..2faed43 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -537,7 +537,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
 /* Clean up guest notifier (irq) */
 k-set_guest_notifiers(qbus-parent, 1, false);
 
-vring_teardown(s-vring);
+vring_teardown(s-vring, s-vdev, 0);
 s-started = false;
 s-stopping = false;
 }
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index e0d6e83..82cc151 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -39,8 +39,8 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
 
 vring_init(vring-vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096);
 
-vring-last_avail_idx = 0;
-vring-last_used_idx = 0;
+vring-last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n);
+vring-last_used_idx = vring-vr.used-idx;
 vring-signalled_used = 0;
 vring-signalled_used_valid = false;
 
@@ -49,8 +49,10 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
 return true;
 }
 
-void vring_teardown(Vring *vring)
+void vring_teardown(Vring *vring, VirtIODevice *vdev, int n)
 {
+virtio_queue_set_last_avail_idx(vdev, n, vring-last_avail_idx);
+
 hostmem_finalize(vring-hostmem);
 }
 
diff --git a/include/hw/virtio/dataplane/vring.h 
b/include/hw/virtio/dataplane/vring.h
index 9380cb5..c0b69ff 100644
--- a/include/hw/virtio/dataplane/vring.h
+++ b/include/hw/virtio/dataplane/vring.h
@@ -50,7 +50,7 @@ static inline void vring_set_broken(Vring *vring)
 }
 
 bool vring_setup(Vring *vring, VirtIODevice *vdev, int n);
-void vring_teardown(Vring *vring);
+void vring_teardown(Vring *vring, VirtIODevice *vdev, int n);
 void vring_disable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_enable_notification(VirtIODevice *vdev, Vring *vring);
 bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
-- 
1.8.1.4

[Qemu-devel] [PULL 0/5] pci,net,pc enhancements

2013-07-17 Thread Michael S. Tsirkin

From: Michael S. Tsirkin m...@redhat.com

The following changes since commit 7588e2b0559ae72d3c2952c7807fc05c03099970:

  pci: Fold host_buses list into PCIHostState functionality (2013-07-07 
23:10:57 +0300)

are available in the git repository at:

  git://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git tags/for_anthony

for you to fetch changes up to d26d9e14c15837eba2b7447e8d15230bab8e0940:

  pc: don't access fw cfg if NULL (2013-07-15 21:26:32 +0300)


pci,net,pc enhancements

This includes some fixes and enhancements that accumulated in my tree:
pci fixes by dkoch, virtio-net enhancements by akong and mst,
and a fix for xen pc by mst.

Signed-off-by: Michael S. Tsirkin m...@redhat.com


Amos Kong (1):
  net: add support of mac-programming over macvtap in QEMU side

Don Koch (2):
  pci-bridge: update mappings for migration/restore
  pci: fix BRDIGE typo

Michael S. Tsirkin (2):
  virtio-net: add feature bit for any header s/g
  pc: don't access fw cfg if NULL

 QMP/qmp-events.txt |  17 ++
 hw/i386/pc.c   |   2 +-
 hw/net/virtio-net.c| 133 +++--
 hw/pci-bridge/i82801b11.c  |   2 +-
 hw/pci/pci.c   |   5 ++
 hw/pci/pci_bridge.c|   2 +-
 include/hw/i386/pc.h   |   4 ++
 include/hw/pci/pci_bridge.h|   1 +
 include/hw/pci/pci_ids.h   |   2 +-
 include/hw/virtio/virtio-net.h |   1 +
 include/hw/virtio/virtio.h |   2 +
 include/monitor/monitor.h  |   1 +
 include/net/net.h  |   3 +
 monitor.c  |   1 +
 net/net.c  |  48 +++
 qapi-schema.json   |  76 +++
 qmp-commands.hx|  63 +++
 17 files changed, 353 insertions(+), 10 deletions(-)

[Qemu-devel] [PATCH v2 11/11] qmp: add command 'blockdev-backup'

2013-07-17 Thread Fam Zheng

Similar to drive-backup, but this command uses a device id as target
instead of creating/opening an image file.

Signed-off-by: Fam Zheng f...@redhat.com
---
 blockdev.c   | 71 
 qapi-schema.json | 49 ++
 qmp-commands.hx  | 22 ++
 3 files changed, 142 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index bb986a1..a3fa5d1 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1517,7 +1517,78 @@ void qmp_drive_backup(const char *device, const char 
*target,
 error_propagate(errp, local_err);
 return;
 }
+/* Grab a reference so hotplug does not delete the BlockDriverState from
+ * underneath us.
+ */
+drive_get_ref(drive_get_by_blockdev(bs));
+}
+
+void qmp_blockdev_backup(const char *device, const char *target,
+   enum MirrorSyncMode sync,
+   bool has_speed, int64_t speed,
+   bool has_on_source_error,
+   BlockdevOnError on_source_error,
+   bool has_on_target_error,
+   BlockdevOnError on_target_error,
+   Error **errp)
+{
+BlockDriverState *bs;
+BlockDriverState *target_bs;
+Error *local_err = NULL;
+
+if (sync != MIRROR_SYNC_MODE_FULL) {
+error_setg(errp, only sync mode 'full' is currently supported);
+return;
+}
+if (!has_speed) {
+speed = 0;
+}
+if (!has_on_source_error) {
+on_source_error = BLOCKDEV_ON_ERROR_REPORT;
+}
+if (!has_on_target_error) {
+on_target_error = BLOCKDEV_ON_ERROR_REPORT;
+}
+
+bs = bdrv_find(device);
+if (!bs) {
+error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+return;
+}
+
+if (!bdrv_is_inserted(bs)) {
+error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device);
+return;
+}
+
+if (bdrv_in_use(bs)) {
+error_set(errp, QERR_DEVICE_IN_USE, device);
+return;
+}
+
+target_bs = bdrv_find(target);
+if (!target_bs) {
+error_set(errp, QERR_DEVICE_NOT_FOUND, target);
+return;
+}
+
+if (!bdrv_is_inserted(target_bs)) {
+error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, target);
+return;
+}
+
+if (bdrv_in_use(target_bs)) {
+error_set(errp, QERR_DEVICE_IN_USE, target);
+return;
+}
 
+backup_start(bs, target_bs, speed, on_source_error, on_target_error,
+ block_job_cb, bs, local_err);
+if (local_err != NULL) {
+bdrv_delete(target_bs);
+error_propagate(errp, local_err);
+return;
+}
 /* Grab a reference so hotplug does not delete the BlockDriverState from
  * underneath us.
  */
diff --git a/qapi-schema.json b/qapi-schema.json
index 7b9fef1..3b7cfcc 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1665,6 +1665,40 @@
 '*on-target-error': 'BlockdevOnError' } }
 
 ##
+# @BlockdevBackup
+#
+# @device: the name of the device which should be copied.
+#
+# @target: the name of the backup target device
+#
+# @sync: what parts of the disk image should be copied to the destination
+#(all the disk, only the sectors allocated in the topmost image, or
+#only new I/O).
+#
+# @speed: #optional the maximum speed, in bytes per second
+#
+# @on-source-error: #optional the action to take on an error on the source,
+#   default 'report'.  'stop' and 'enospc' can only be used
+#   if the block device supports io-status (see BlockInfo).
+#
+# @on-target-error: #optional the action to take on an error on the target,
+#   default 'report' (no limitations, since this applies to
+#   a different block device than @device).
+#
+# Note that @on-source-error and @on-target-error only affect background I/O.
+# If an error occurs during a guest write request, the device's rerror/werror
+# actions will be used.
+#
+# Since: 1.6
+##
+{ 'type': 'BlockdevBackup',
+  'data': { 'device': 'str', 'target': 'str',
+'sync': 'MirrorSyncMode',
+'*speed': 'int',
+'*on-source-error': 'BlockdevOnError',
+'*on-target-error': 'BlockdevOnError' } }
+
+##
 # @Abort
 #
 # This action can be used to test transaction failure.
@@ -1806,6 +1840,21 @@
 { 'command': 'drive-backup', 'data': 'DriveBackup' }
 
 ##
+# @blockdev-backup
+#
+# Block device version for drive-backup command. Use existing device as target
+# of backup.
+#
+# For the arguments, see the documentation of BlockdevBackup.
+#
+# Returns: nothing on success
+#  If @device or @target is not a valid block device, DeviceNotFound
+#
+# Since 1.6
+##
+{ 'command': 'blockdev-backup', 'data': 'BlockdevBackup' }
+
+##
 # @drive-mirror
 #
 # Start mirroring a block device's writes to a new destination.
diff --git a/qmp-commands.hx b/qmp-commands.hx
index e075df4..ecd9f64 100644
---

[Qemu-devel] [PATCH v2 07/11] block: hold hard reference for backup/mirror target

2013-07-17 Thread Fam Zheng

Signed-off-by: Fam Zheng f...@redhat.com
---
 block/backup.c | 3 ++-
 block/mirror.c | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 16105d4..b82f601 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -294,7 +294,7 @@ static void coroutine_fn backup_run(void *opaque)
 hbitmap_free(job-bitmap);
 
 bdrv_iostatus_disable(target);
-bdrv_delete(target);
+bdrv_unref(target, true);
 
 block_job_completed(job-common, ret);
 }
@@ -332,6 +332,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState 
*target,
 return;
 }
 
+bdrv_ref(target, true);
 job-on_source_error = on_source_error;
 job-on_target_error = on_target_error;
 job-target = target;
diff --git a/block/mirror.c b/block/mirror.c
index bed4a7e..decdedb 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -479,8 +479,7 @@ immediate_exit:
 }
 bdrv_swap(s-target, s-common.bs);
 }
-bdrv_close(s-target);
-bdrv_delete(s-target);
+bdrv_unref(s-target, true);
 block_job_completed(s-common, ret);
 }
 
@@ -574,6 +573,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState 
*target,
 s-granularity = granularity;
 s-buf_size = MAX(buf_size, granularity);
 
+bdrv_ref(target, true);
 bdrv_set_dirty_tracking(bs, granularity);
 bdrv_set_enable_write_cache(s-target, true);
 bdrv_set_on_error(s-target, on_target_error, on_target_error);
-- 
1.8.3.2

[Qemu-devel] [PULL 1/5] pci-bridge: update mappings for migration/restore

2013-07-17 Thread Michael S. Tsirkin

From: Don Koch dk...@verizon.com

Fix for LP#1187529: Devices on PCI bridge stop working when
live-migrated. Update bridge mappings for all PCI bridge
devices in get_pci_config_device().

Signed-off-by: Don Koch dk...@verizon.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci/pci.c| 5 +
 hw/pci/pci_bridge.c | 2 +-
 include/hw/pci/pci_bridge.h | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 8680063..8087c18 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -378,6 +378,7 @@ int pci_bus_num(PCIBus *s)
 static int get_pci_config_device(QEMUFile *f, void *pv, size_t size)
 {
 PCIDevice *s = container_of(pv, PCIDevice, config);
+PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(s);
 uint8_t *config;
 int i;
 
@@ -395,6 +396,10 @@ static int get_pci_config_device(QEMUFile *f, void *pv, 
size_t size)
 memcpy(s-config, config, size);
 
 pci_update_mappings(s);
+if (pc-is_bridge) {
+PCIBridge *b = container_of(s, PCIBridge, dev);
+pci_bridge_update_mappings(b);
+}
 
 memory_region_set_enabled(s-bus_master_enable_region,
   pci_get_word(s-config + PCI_COMMAND)
diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 24be6c5..3897bd8 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -224,7 +224,7 @@ static void pci_bridge_region_cleanup(PCIBridge *br, 
PCIBridgeWindows *w)
 g_free(w);
 }
 
-static void pci_bridge_update_mappings(PCIBridge *br)
+void pci_bridge_update_mappings(PCIBridge *br)
 {
 PCIBridgeWindows *w = br-windows;
 
diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h
index 1868f7a..1d8f997 100644
--- a/include/hw/pci/pci_bridge.h
+++ b/include/hw/pci/pci_bridge.h
@@ -37,6 +37,7 @@ PCIBus *pci_bridge_get_sec_bus(PCIBridge *br);
 pcibus_t pci_bridge_get_base(const PCIDevice *bridge, uint8_t type);
 pcibus_t pci_bridge_get_limit(const PCIDevice *bridge, uint8_t type);
 
+void pci_bridge_update_mappings(PCIBridge *br);
 void pci_bridge_write_config(PCIDevice *d,
  uint32_t address, uint32_t val, int len);
 void pci_bridge_disable_base_limit(PCIDevice *dev);
-- 
MST

[Qemu-devel] [PULL 3/5] net: add support of mac-programming over macvtap in QEMU side

2013-07-17 Thread Michael S. Tsirkin

From: Amos Kong ak...@redhat.com

Currently macvtap based macvlan device is working in promiscuous
mode, we want to implement mac-programming over macvtap through
Libvirt for better performance.

Design:
 QEMU notifies Libvirt when rx-filter config is changed in guest,
 then Libvirt query the rx-filter information by a monitor command,
 and sync the change to macvtap device. Related rx-filter config
 of the nic contains main mac, rx-mode items and vlan table.

This patch adds a QMP event to notify management of rx-filter change,
and adds a monitor command for management to query rx-filter
information.

Test:
 If we repeatedly add/remove vlan, and change macaddr of vlan
 interfaces in guest by a loop script.

Result:
 The events will flood the QMP client(management), management takes
 too much resource to process the events.

 Event_throttle API (set rate to 1 ms) can avoid the events to flood
 QMP client, but it could cause an unexpected delay (~1ms), guests
 guests normally expect rx-filter updates immediately.

 So we use a flag for each nic to avoid events flooding, the event
 is emitted once until the query command is executed. The flag
 implementation could not introduce unexpected delay.

There maybe exist an uncontrollable delay if we let Libvirt do the
real change, guests normally expect rx-filter updates immediately.
But it's another separate issue, we can investigate it when the
work in Libvirt side is done.

Michael S. Tsirkin: tweaked to enable events on start
Michael S. Tsirkin: fixed not to crash when no id
Michael S. Tsirkin: fold in patch:
   additional fixes for mac-programming feature
Amos Kong: always notify QMP client if mactable is changed
Amos Kong: return NULL list if no net client supports rx-filter query

Reviewed-by: Eric Blake ebl...@redhat.com
Reviewed-by: Markus Armbruster arm...@redhat.com
Signed-off-by: Amos Kong ak...@redhat.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 QMP/qmp-events.txt|  17 ++
 hw/net/virtio-net.c   | 133 +++---
 include/monitor/monitor.h |   1 +
 include/net/net.h |   3 ++
 monitor.c |   1 +
 net/net.c |  48 +
 qapi-schema.json  |  76 ++
 qmp-commands.hx   |  63 ++
 8 files changed, 336 insertions(+), 6 deletions(-)

diff --git a/QMP/qmp-events.txt b/QMP/qmp-events.txt
index 24e804e9..39b6016 100644
--- a/QMP/qmp-events.txt
+++ b/QMP/qmp-events.txt
@@ -172,6 +172,23 @@ Data:
   },
   timestamp: { seconds: 1265044230, microseconds: 450486 } }
 
+NIC_RX_FILTER_CHANGED
+-
+
+The event is emitted once until the query command is executed,
+the first event will always be emitted.
+
+Data:
+
+- name: net client name (json-string)
+- path: device path (json-string)
+
+{ event: NIC_RX_FILTER_CHANGED,
+  data: { name: vnet0,
+path: /machine/peripheral/vnet0/virtio-backend },
+  timestamp: { seconds: 1368697518, microseconds: 326866 } }
+}
+
 RESET
 -
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 1ea9556..679f50c 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -21,6 +21,8 @@
 #include hw/virtio/virtio-net.h
 #include net/vhost_net.h
 #include hw/virtio/virtio-bus.h
+#include qapi/qmp/qjson.h
+#include monitor/monitor.h
 
 #define VIRTIO_NET_VM_VERSION11
 
@@ -192,6 +194,105 @@ static void virtio_net_set_link_status(NetClientState *nc)
 virtio_net_set_status(vdev, vdev-status);
 }
 
+static void rxfilter_notify(NetClientState *nc)
+{
+QObject *event_data;
+VirtIONet *n = qemu_get_nic_opaque(nc);
+
+if (nc-rxfilter_notify_enabled) {
+if (n-netclient_name) {
+event_data = qobject_from_jsonf({ 'name': %s, 'path': %s },
+n-netclient_name,
+
object_get_canonical_path(OBJECT(n-qdev)));
+} else {
+event_data = qobject_from_jsonf({ 'path': %s },
+
object_get_canonical_path(OBJECT(n-qdev)));
+}
+monitor_protocol_event(QEVENT_NIC_RX_FILTER_CHANGED, event_data);
+qobject_decref(event_data);
+
+/* disable event notification to avoid events flooding */
+nc-rxfilter_notify_enabled = 0;
+}
+}
+
+static char *mac_strdup_printf(const uint8_t *mac)
+{
+return g_strdup_printf(%.2x:%.2x:%.2x:%.2x:%.2x:%.2x, mac[0],
+mac[1], mac[2], mac[3], mac[4], mac[5]);
+}
+
+static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
+{
+VirtIONet *n = qemu_get_nic_opaque(nc);
+RxFilterInfo *info;
+strList *str_list, *entry;
+intList *int_list, *int_entry;
+int i, j;
+
+info = g_malloc0(sizeof(*info));
+info-name = g_strdup(nc-name);
+info-promiscuous = n-promisc;
+
+if (n-nouni) {
+info-unicast = RX_STATE_NONE;
+} else if (n-alluni) {
+info-unicast =

[Qemu-devel] [PATCH v2 08/11] block: simplify bdrv_drop_intermediate

2013-07-17 Thread Fam Zheng

bdrv_drop_intermediate used a local list to iterate through backing
chain and delete each BDS. It is simplified while adopting to refcount
mechanism.

Signed-off-by: Fam Zheng f...@redhat.com
---
 block.c | 71 ++---
 1 file changed, 11 insertions(+), 60 deletions(-)

diff --git a/block.c b/block.c
index 57a3876..499de22 100644
--- a/block.c
+++ b/block.c
@@ -2027,12 +2027,6 @@ BlockDriverState *bdrv_find_overlay(BlockDriverState 
*active,
 return overlay;
 }
 
-typedef struct BlkIntermediateStates {
-BlockDriverState *bs;
-QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
-} BlkIntermediateStates;
-
-
 /*
  * Drops images above 'base' up to and including 'top', and sets the image
  * above 'top' to have base as its backing file.
@@ -2062,15 +2056,9 @@ typedef struct BlkIntermediateStates {
 int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
BlockDriverState *base)
 {
-BlockDriverState *intermediate;
-BlockDriverState *base_bs = NULL;
 BlockDriverState *new_top_bs = NULL;
-BlkIntermediateStates *intermediate_state, *next;
 int ret = -EIO;
 
-QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
-QSIMPLEQ_INIT(states_to_delete);
-
 if (!top-drv || !base-drv) {
 goto exit;
 }
@@ -2082,58 +2070,21 @@ int bdrv_drop_intermediate(BlockDriverState *active, 
BlockDriverState *top,
 goto exit;
 }
 
-/* special case of new_top_bs-backing_hd already pointing to base - 
nothing
- * to do, no intermediate images */
-if (new_top_bs-backing_hd == base) {
-ret = 0;
-goto exit;
-}
-
-intermediate = top;
-
-/* now we will go down through the list, and add each BDS we find
- * into our deletion queue, until we hit the 'base'
- */
-while (intermediate) {
-intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
-intermediate_state-bs = intermediate;
-QSIMPLEQ_INSERT_TAIL(states_to_delete, intermediate_state, entry);
-
-if (intermediate-backing_hd == base) {
-base_bs = intermediate-backing_hd;
-break;
+while (new_top_bs-backing_hd  new_top_bs-backing_hd != base) {
+BlockDriverState *backing = new_top_bs-backing_hd;
+if (backing == NULL) {
+goto exit;
 }
-intermediate = intermediate-backing_hd;
-}
-if (base_bs == NULL) {
-/* something went wrong, we did not end at the base. safely
- * unravel everything, and exit with error */
-goto exit;
+new_top_bs-backing_hd = backing-backing_hd;
+/* break backing_hd chain before releasing bs, so we don't free all the
+ * way up the backing chain */
+backing-backing_hd = NULL;
+bdrv_unref(backing, false);
 }
 
-/* success - we can delete the intermediate states, and link top-base */
-ret = bdrv_change_backing_file(new_top_bs, base_bs-filename,
-   base_bs-drv ? base_bs-drv-format_name : 
);
-if (ret) {
-goto exit;
-}
-if (new_top_bs-backing_hd) {
-bdrv_unref(new_top_bs-backing_hd, false);
-}
-new_top_bs-backing_hd = base_bs;
-bdrv_ref(base_bs, false);
-
-QSIMPLEQ_FOREACH_SAFE(intermediate_state, states_to_delete, entry, next) {
-/* so that bdrv_close() does not recursively close the chain */
-intermediate_state-bs-backing_hd = NULL;
-bdrv_delete(intermediate_state-bs);
-}
-ret = 0;
-
+ret = bdrv_change_backing_file(new_top_bs, base-filename,
+   base-drv ? base-drv-format_name : );
 exit:
-QSIMPLEQ_FOREACH_SAFE(intermediate_state, states_to_delete, entry, next) {
-g_free(intermediate_state);
-}
 return ret;
 }
 
-- 
1.8.3.2

[Qemu-devel] [PULL 4/5] virtio-net: add feature bit for any header s/g

2013-07-17 Thread Michael S. Tsirkin

Old qemu versions required that 1st s/g entry is the header.

Since QEMU 1.5, patchset titled virtio-net: iovec handling cleanup
removed this limitation but a feature bit is needed so guests know it's
safe to lay out header differently.

This patch applies on top and adds such a feature bit to QEMU.
It is set by default for virtio-net.
virtio net header inline with the data is beneficial
for latency and small packet bandwidth - guest driver
code utilizing this feature has been acked but missed 3.11
by a narrow margin, it's pending for 3.12.

This feature bit is cleared by default when compatibility with old
machine types is requested.

Other performance-sensitive devices (blk and scsi)
don't yet support arbitrary s/g layouts, so
we only set this bit for virtio-net for now.
There are plans to allow arbitrary layouts there, but
no code has been posted yet.

Cc: Rusty Russell ru...@rustcorp.com.au
Reviewed-by: Laszlo Ersek ler...@redhat.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 include/hw/i386/pc.h   | 4 
 include/hw/virtio/virtio-net.h | 1 +
 include/hw/virtio/virtio.h | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 5949e7e..751592a 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -229,6 +229,10 @@ int e820_add_entry(uint64_t, uint64_t, uint32_t);
 .driver   = Nehalem- TYPE_X86_CPU,\
 .property = level,\
 .value= stringify(2),\
+},{\
+.driver   = virtio-net-pci,\
+.property = any_layout,\
+.value= off,\
 }
 
 #define PC_COMPAT_1_4 \
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index b315ac9..df60f16 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -243,6 +243,7 @@ struct virtio_net_ctrl_mq {
 
 #define DEFINE_VIRTIO_NET_FEATURES(_state, _field) \
 DEFINE_VIRTIO_COMMON_FEATURES(_state, _field), \
+DEFINE_PROP_BIT(any_layout, _state, _field, VIRTIO_F_ANY_LAYOUT, 
true), \
 DEFINE_PROP_BIT(csum, _state, _field, VIRTIO_NET_F_CSUM, true), \
 DEFINE_PROP_BIT(guest_csum, _state, _field, VIRTIO_NET_F_GUEST_CSUM, 
true), \
 DEFINE_PROP_BIT(gso, _state, _field, VIRTIO_NET_F_GSO, true), \
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index a6c5c53..5d1d2be 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -43,6 +43,8 @@
 /* We notify when the ring is completely used, even if the guest is suppressing
  * callbacks */
 #define VIRTIO_F_NOTIFY_ON_EMPTY24
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT 27
 /* We support indirect buffer descriptors */
 #define VIRTIO_RING_F_INDIRECT_DESC 28
 /* The Guest publishes the used index for which it expects an interrupt
-- 
MST

[Qemu-devel] [PATCH] RFC: hcd-ohci: add dma error handling

2013-07-17 Thread Alexey Kardashevskiy

From: Benjamin Herrenschmidt b...@kernel.crashing.org

Current hcd-ohci does not handle DMA errors which can actually
happen.

However it is not clear what approach should be used here -
for example, get_dwords returns positive number saying that there
is no error as all the callers consider the return value as fail
if it is less than zero. Normally you would expect bool=true/int=0
as success and bool=false/int=-1 as fail.

Any suggestion?

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 hw/usb/hcd-ohci.c | 77 +--
 1 file changed, 52 insertions(+), 25 deletions(-)

diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c
index a096ecf..48297da 100644
--- a/hw/usb/hcd-ohci.c
+++ b/hw/usb/hcd-ohci.c
@@ -508,7 +508,9 @@ static inline int get_dwords(OHCIState *ohci,
 addr += ohci-localmem_base;
 
 for (i = 0; i  num; i++, buf++, addr += sizeof(*buf)) {
-dma_memory_read(ohci-as, addr, buf, sizeof(*buf));
+if (dma_memory_read(ohci-as, addr, buf, sizeof(*buf))) {
+return -1;
+}
 *buf = le32_to_cpu(*buf);
 }
 
@@ -525,7 +527,9 @@ static inline int put_dwords(OHCIState *ohci,
 
 for (i = 0; i  num; i++, buf++, addr += sizeof(*buf)) {
 uint32_t tmp = cpu_to_le32(*buf);
-dma_memory_write(ohci-as, addr, tmp, sizeof(tmp));
+if (dma_memory_write(ohci-as, addr, tmp, sizeof(tmp))) {
+return -1;
+}
 }
 
 return 1;
@@ -540,7 +544,9 @@ static inline int get_words(OHCIState *ohci,
 addr += ohci-localmem_base;
 
 for (i = 0; i  num; i++, buf++, addr += sizeof(*buf)) {
-dma_memory_read(ohci-as, addr, buf, sizeof(*buf));
+if (dma_memory_read(ohci-as, addr, buf, sizeof(*buf))) {
+return -1;
+}
 *buf = le16_to_cpu(*buf);
 }
 
@@ -557,7 +563,9 @@ static inline int put_words(OHCIState *ohci,
 
 for (i = 0; i  num; i++, buf++, addr += sizeof(*buf)) {
 uint16_t tmp = cpu_to_le16(*buf);
-dma_memory_write(ohci-as, addr, tmp, sizeof(tmp));
+if (dma_memory_write(ohci-as, addr, tmp, sizeof(tmp))) {
+return -1;
+}
 }
 
 return 1;
@@ -585,8 +593,8 @@ static inline int ohci_read_iso_td(OHCIState *ohci,
 static inline int ohci_read_hcca(OHCIState *ohci,
  dma_addr_t addr, struct ohci_hcca *hcca)
 {
-dma_memory_read(ohci-as, addr + ohci-localmem_base, hcca, sizeof(*hcca));
-return 1;
+return dma_memory_read(ohci-as, addr + ohci-localmem_base,
+   hcca, sizeof(*hcca)) ? -1 : 0;
 }
 
 static inline int ohci_put_ed(OHCIState *ohci,
@@ -617,16 +625,15 @@ static inline int ohci_put_iso_td(OHCIState *ohci,
 static inline int ohci_put_hcca(OHCIState *ohci,
 dma_addr_t addr, struct ohci_hcca *hcca)
 {
-dma_memory_write(ohci-as,
- addr + ohci-localmem_base + HCCA_WRITEBACK_OFFSET,
- (char *)hcca + HCCA_WRITEBACK_OFFSET,
- HCCA_WRITEBACK_SIZE);
-return 1;
+return dma_memory_write(ohci-as,
+addr + ohci-localmem_base + HCCA_WRITEBACK_OFFSET,
+(char *)hcca + HCCA_WRITEBACK_OFFSET,
+HCCA_WRITEBACK_SIZE) ? -1 : 0;
 }
 
 /* Read/Write the contents of a TD from/to main memory.  */
-static void ohci_copy_td(OHCIState *ohci, struct ohci_td *td,
- uint8_t *buf, int len, DMADirection dir)
+static int ohci_copy_td(OHCIState *ohci, struct ohci_td *td,
+uint8_t *buf, int len, DMADirection dir)
 {
 dma_addr_t ptr, n;
 
@@ -634,18 +641,26 @@ static void ohci_copy_td(OHCIState *ohci, struct ohci_td 
*td,
 n = 0x1000 - (ptr  0xfff);
 if (n  len)
 n = len;
-dma_memory_rw(ohci-as, ptr + ohci-localmem_base, buf, n, dir);
-if (n == len)
-return;
+
+if (dma_memory_rw(ohci-as, ptr + ohci-localmem_base, buf, n, dir)) {
+return -1;
+}
+if (n == len) {
+return 0;
+}
 ptr = td-be  ~0xfffu;
 buf += n;
-dma_memory_rw(ohci-as, ptr + ohci-localmem_base, buf, len - n, dir);
+if (dma_memory_rw(ohci-as, ptr + ohci-localmem_base, buf,
+  len - n, dir)) {
+return -1;
+}
+return 0;
 }
 
 /* Read/Write the contents of an ISO TD from/to main memory.  */
-static void ohci_copy_iso_td(OHCIState *ohci,
- uint32_t start_addr, uint32_t end_addr,
- uint8_t *buf, int len, DMADirection dir)
+static int ohci_copy_iso_td(OHCIState *ohci,
+uint32_t start_addr, uint32_t end_addr,
+uint8_t *buf, int len, DMADirection dir)
 {
 dma_addr_t ptr, n;
 
@@ -653,12 +668,20 @@ static void ohci_copy_iso_td(OHCIState *ohci,
 n = 0x1000 - (ptr  0xfff);
 if (n  len)
 n = len;
-

[Qemu-devel] [PULL 5/5] pc: don't access fw cfg if NULL

2013-07-17 Thread Michael S. Tsirkin

commit f8c457b88d72a48989f190bc3d7b79f4f3b7d11c
 pc: pass PCI hole ranges to Guests
broke Xen as it has no fw_cfg.
Check for this configuration and boil out.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
Tested-by: Stefano Stabellini stefano.stabell...@eu.citrix.com
---
 hw/i386/pc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 80c27d6..cb3897e 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1000,7 +1000,7 @@ typedef struct PcRomPciInfo {
 static void pc_fw_cfg_guest_info(PcGuestInfo *guest_info)
 {
 PcRomPciInfo *info;
-if (!guest_info-has_pci_info) {
+if (!guest_info-has_pci_info || !guest_info-fw_cfg) {
 return;
 }
 
-- 
MST

[Qemu-devel] [PATCH V17 8/9] qemu-option: make qemu_opts_del accept opts being NULL

2013-07-17 Thread Dong Xu Wang

Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
---
 util/qemu-option.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/util/qemu-option.c b/util/qemu-option.c
index 7545486..f4a0282 100644
--- a/util/qemu-option.c
+++ b/util/qemu-option.c
@@ -656,6 +656,10 @@ void qemu_opts_del(QemuOpts *opts)
 {
 QemuOpt *opt;
 
+if (!opts) {
+return;
+}
+
 for (;;) {
 opt = QTAILQ_FIRST(opts-head);
 if (opt == NULL)
-- 
1.7.11.7

Re: [Qemu-devel] [PULL 3/5] exec: Support 64-bit operations in address_space_rw

2013-07-17 Thread Markus Armbruster

Richard Henderson r...@twiddle.net writes:

 Honor the implementation maximum access size, and at least check
 the minimum access size.

 Reviewed-by: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Richard Henderson r...@twiddle.net

Fails for me:

qemu-system-x86_64: /work/armbru/qemu/exec.c:1927: memory_access_size: 
Assertion `l = access_size_min' failed.

Workaround: disable USB.

Backtrace:

#3  0x003707a2e752 in __assert_fail () from /lib64/libc.so.6
#4  0x006951e3 in memory_access_size (mr=0x1116c00, l=1, addr=12)
at /work/armbru/qemu/exec.c:1927
#5  0x0069529d in address_space_rw (as=0x1045920, addr=49228, buf=
0x75392af8 @\240\377\367L\300, len=1, is_write=true)
at /work/armbru/qemu/exec.c:1948
#6  0x0069566b in address_space_write (as=0x1045920, addr=49228, buf=
0x75392af8 @\240\377\367L\300, len=1)
at /work/armbru/qemu/exec.c:2027
#7  0x00701356 in cpu_outb (addr=49228, val=64 '@')
at /work/armbru/qemu/ioport.c:51
#8  0x00705ab1 in kvm_handle_io (port=49228, data=0x77ffa000, 
direction=1, size=1, count=1) at /work/armbru/qemu/kvm-all.c:1517
#9  0x0070606f in kvm_cpu_exec (cpu=0x10d3c90)
at /work/armbru/qemu/kvm-all.c:1664
#10 0x00687da7 in qemu_kvm_cpu_thread_fn (arg=0x10d3c90)
at /work/armbru/qemu/cpus.c:751
#11 0x003708607d14 in start_thread () from /lib64/libpthread.so.0
#12 0x003707af168d in clone () from /lib64/libc.so.6

Hopefully useful state:

(gdb) p *mr
$1 = {ops = 0x8b0a20, iommu_ops = 0x0, opaque = 0x1116450, owner = 0x0, 
  parent = 0x10c3a10, size = {lo = 32, hi = 0}, addr = 49216, destructor = 
0x70b2a4 memory_region_destructor_none, ram_addr = 18446744073709551615, 
  subpage = false, terminates = true, romd_mode = true, ram = false, 
  readonly = false, enabled = true, rom_device = false, warning_printed = 
false, flush_coalesced_mmio = false, alias = 0x0, alias_offset = 0, 
  priority = 1, may_overlap = true, subregions = {tqh_first = 0x0, tqh_last = 
0x1116c78}, subregions_link = {tqe_next = 0x111b2e8, tqe_prev = 
0x74a9c928}, coalesced = {tqh_first = 0x0, tqh_last = 0x1116c98}, 
  name = 0x11174e0 uhci, dirty_log_mask = 0 '\000', ioeventfd_nb = 0, 
  ioeventfds = 0x0, iommu_notify = {notifiers = {lh_first = 0x0}}}
(gdb) p *mr-ops
$2 = {read = 0x5becdf uhci_port_read, write = 0x5be8cd uhci_port_write, 
  endianness = DEVICE_LITTLE_ENDIAN, valid = {min_access_size = 1, 
max_access_size = 4, unaligned = false, accepts = 0}, impl = {
min_access_size = 2, max_access_size = 2, unaligned = false}, old_mmio = {
read = {0, 0, 0}, write = {0, 0, 0}}}

info mtree
memory
-7ffe (prio 0, RW): system
  -1fff (prio 0, RW): alias ram-below-4g @pc.ram 
-1fff
  000a-000b (prio 1, RW): alias smram-region @pci 
000a-000b
  000c-000c3fff (prio 1, RW): alias pam-pci @pci 
000c-000c3fff
  000c4000-000c7fff (prio 1, RW): alias pam-pci @pci 
000c4000-000c7fff
  000c8000-000cbfff (prio 1, RW): alias pam-pci @pci 
000c8000-000cbfff
  000cc000-000c (prio 1, RW): alias pam-pci @pci 
000cc000-000c
  000d-000d3fff (prio 1, RW): alias pam-pci @pci 
000d-000d3fff
  000d4000-000d7fff (prio 1, RW): alias pam-pci @pci 
000d4000-000d7fff
  000d8000-000dbfff (prio 1, RW): alias pam-pci @pci 
000d8000-000dbfff
  000dc000-000d (prio 1, RW): alias pam-pci @pci 
000dc000-000d
  000e-000e3fff (prio 1, RW): alias pam-pci @pci 
000e-000e3fff
  000e4000-000e7fff (prio 1, RW): alias pam-pci @pci 
000e4000-000e7fff
  000e8000-000ebfff (prio 1, RW): alias pam-pci @pci 
000e8000-000ebfff
  000ec000-000e (prio 1, RW): alias pam-pci @pci 
000ec000-000e
  000f-000f (prio 1, RW): alias pam-pci @pci 
000f-000f
  2000- (prio 0, RW): alias pci-hole @pci 
2000-
  fec0-fec00fff (prio 0, RW): kvm-ioapic
  fed0-fed003ff (prio 0, RW): hpet
  fee0-feef (prio 4096, RW): icc-apic-container
fee0-feef (prio 0, RW): kvm-apic-msi
  0001-4000 (prio 0, RW): alias pci-hole64 @pci 
0001-4000
I/O
- (prio 0, RW): io
  -0007 (prio 0, RW): dma-chan
  0008-000f (prio 0, RW): dma-cont
  0020-0021 (prio 0,

Re: [Qemu-devel] [PATCH V17 8/9] qemu-option: make qemu_opts_del accept opts being NULL

2013-07-17 Thread Dong Xu Wang

Please ignore patch 8 and patch9, it won't work and I will re-send
them seperately.
Sorry for the inconvenience.

On Wed, Jul 17, 2013 at 5:29 PM, Dong Xu Wang
wdon...@linux.vnet.ibm.com wrote:
 Signed-off-by: Dong Xu Wang wdon...@linux.vnet.ibm.com
 ---
  util/qemu-option.c | 4 
  1 file changed, 4 insertions(+)

 diff --git a/util/qemu-option.c b/util/qemu-option.c
 index 7545486..f4a0282 100644
 --- a/util/qemu-option.c
 +++ b/util/qemu-option.c
 @@ -656,6 +656,10 @@ void qemu_opts_del(QemuOpts *opts)
  {
  QemuOpt *opt;

 +if (!opts) {
 +return;
 +}
 +
  for (;;) {
  opt = QTAILQ_FIRST(opts-head);
  if (opt == NULL)
 --
 1.7.11.7

Re: [Qemu-devel] [PATCH v2 00/13] dataplane: use block layer

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 11:22, Stefan Hajnoczi ha scritto:
 Stopping the dataplane event loop is not really necessary for
 correctness; that only requires stopping the ioeventfd, and indeed this
 series already includes patches for this.  So you definitely have my
 approval (in quotes because you're the maintainer...) for patches 1-2-3.
 
 The TLS AioContext approach is compatible with the future improvements
 that you mentioned.

Yes, it is.  But I'd rather try and understand whether we need it,
before committing it.

 The point of TLS AioContext is to avoid explicitly passing AioContext
 everywhere.  Since BH is sometimes used deep down it would be very
 tedious to start passing around AioContext.  qemu_bh_*() should just do
 the right thing.

Note that only aio_bh_new() requires the AioContext.  Scheduling the BH
or deleting doesn't.

 BTW, besides stopping ioeventfd it is also necessary to either complete
 pending I/O requests (drain) or to at least postpone callbacks for
 pending I/O requests while another thread is accessing the BDS.

Yes, draining is cool, I omitted it for simplicity because it happens
also for non-dataplane.  In fact, this is not dataplane-specific, no?
So let's go with what we've been using so far---I agree that postponing
risks getting thorny.

 However, I'm not sure it is feasible to have a single AioContext per
 thread.  Consider a backup job whose target is not running in the same
 AioContext as the source; for optimization it might use bdrv_aio_*
 instead of coroutine-based I/O.

 So, once you have stopped the ioeventfd to correctly support
 bdrv_drain_all(), I would like to see code that makes other threads use
 bottom halves to do a context switch.  Ping Fan's patches for a
 thread-safe BH list are useful for this.
 
 If I understand correctly this is means a block job wraps I/O calls so
 that they are scheduled in a remote AioContext/thread when necessary.
 This is necessary when a block job touches multiple BDS which belong to
 different threads.

Yes.  I didn't consider that at this point you don't even need to put
block jobs in the AioContext's thread, but you're right.  They can run
in the iothread.

 I think there are two ways to implement synchronous operations, there
 are two approaches:

 * A lock, just like Kevin mentioned (though I would place it on the
 AioContext, not the BlockDriverState).  Then you can call aio_wait under
 the lock in bdrv_read and friends.
 
 This sounds clever.  So block jobs don't need to explicitly wrap I/O
 calls, it happens automatically in block.c:bdrv_read() and friends.
 
 * A complementary bottom half that is scheduled in the
 qemu_aio_context.  This second bottom half terminates processing in the
 dataplane thread and restarts the thread starting the synchronous
 operation.  I'm not sure how easy it would be to write infrastructure
 for this, but basically it'd mean adding a third path (in coroutine
 context, but in the wrong AioContext) to the current if
 (qemu_in_coroutine()) tests that block.c has.
 
 Not sure I understand this approach but maybe something like moving a
 coroutine from one AioContext to another and back again.

Yes.  I'm not sure which is best, of course.

 FWIW I'm also looking at how much (little) effort it would take to
 disable dataplane temporarily while block jobs are running.  Since we're
 discussing switching between threads already, it's not clear that we
 gain much performance by trying to use dataplane.  It clearly adds
 complexity.

Long term I'd like to remove the non-dataplane code altogether.  I.e.
the ioeventfd is started at the first kick and stopped at drain, but
apart from that all virtio-blk I/O uses in the dataplane thread's event
loop, always.

It needs ioeventfd support in TCG, though.

Paolo

[Qemu-devel] [PATCH v2] block: fix vvfat error path for enable_write_target

2013-07-17 Thread Fam Zheng

s-qcow and s-qcow_filename are allocated but not freed on error. Fix the
possible leaks, remove unnecessary check for bdrv_new(), propagate ret code of
bdrv_create() and also the one of enable_write_target().

Signed-off-by: Fam Zheng f...@redhat.com
---

v2: Fix leak of s-qcow_filename, propagate returen value of
enable_write_target(). [Laszlo]

---
 block/vvfat.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 87b0279..cd3b8ed 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1164,8 +1164,8 @@ DLOG(if (stderr == NULL) {
 s-sector_count = cyls * heads * secs - (s-first_sectors_number - 1);
 
 if (qemu_opt_get_bool(opts, rw, false)) {
-if (enable_write_target(s)) {
-ret = -EIO;
+ret = enable_write_target(s);
+if (ret  0) {
 goto fail;
 }
 bs-read_only = 0;
@@ -2917,9 +2917,7 @@ static int enable_write_target(BDRVVVFATState *s)
 s-qcow_filename = g_malloc(1024);
 ret = get_tmp_filename(s-qcow_filename, 1024);
 if (ret  0) {
-g_free(s-qcow_filename);
-s-qcow_filename = NULL;
-return ret;
+goto err;
 }
 
 bdrv_qcow = bdrv_find_format(qcow);
@@ -2927,18 +2925,18 @@ static int enable_write_target(BDRVVVFATState *s)
 set_option_parameter_int(options, BLOCK_OPT_SIZE, s-sector_count * 512);
 set_option_parameter(options, BLOCK_OPT_BACKING_FILE, fat:);
 
-if (bdrv_create(bdrv_qcow, s-qcow_filename, options)  0)
-   return -1;
+ret = bdrv_create(bdrv_qcow, s-qcow_filename, options);
+if (ret  0) {
+goto err;
+}
 
 s-qcow = bdrv_new();
-if (s-qcow == NULL) {
-return -1;
-}
 
 ret = bdrv_open(s-qcow, s-qcow_filename, NULL,
 BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
 if (ret  0) {
-   return ret;
+bdrv_delete(s-qcow);
+goto err;
 }
 
 #ifndef _WIN32
@@ -2951,6 +2949,11 @@ static int enable_write_target(BDRVVVFATState *s)
 *(void**)s-bs-backing_hd-opaque = s;
 
 return 0;
+
+err:
+g_free(s-qcow_filename);
+s-qcow_filename = NULL;
+return ret;
 }
 
 static void vvfat_close(BlockDriverState *bs)
-- 
1.8.3.2

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 10:46, Kevin Wolf ha scritto:
 Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
 if a destination has has_zero_init = 0, but it supports
 discard zeroes use discard to convert the target
 into an all zero device.

 Signed-off-by: Peter Lieven p...@kamp.de
 
 Wouldn't it be better to use bdrv_write_zeroes() and extend the
 implementation of that to use discard internally in those block drivers
 where it makes sense?
 
 Because here you're not really discarding (i.e. don't care about the
 sectors any more), but you want them to be zeroed.

I thought the same yesterday when reviewing the series, but I'm not
convinced.

Discarding is not always the right way to write zeroes, because it can
disrupt performance.  It may be fine when you are already going to write
a sparse image (as is the case for qemu-img convert), but not in
general.  So if you just used write_zeroes, it would have to fall under
yet another -drive option (or an extension to -drive discard).  I
think what Peter did is a good compromise in the end.

BTW, Peter and Ronnie: we were assuming that UNMAP with LBPRZ=1 always
zeroes blocks, but is that true for unaligned operations?

Paolo

[Qemu-devel] [PATCH V5 01/12] NUMA: add NumaOptions, NumaNodeOptions and NumaMemOptions

2013-07-17 Thread Wanlong Gao

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 qapi-schema.json | 44 
 1 file changed, 44 insertions(+)

diff --git a/qapi-schema.json b/qapi-schema.json
index 7b9fef1..f753a35 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3679,3 +3679,47 @@
 '*cpuid-input-ecx': 'int',
 'cpuid-register': 'X86CPURegister32',
 'features': 'int' } }
+
+##
+# @NumaOptions
+#
+# A discriminated record of NUMA options.
+#
+# Since 1.6
+##
+{ 'union': 'NumaOptions',
+  'data': {
+'node':'NumaNodeOptions',
+'mem': 'NumaMemOptions' }}
+
+##
+# @NumaNodeOptions
+#
+# Create a guest NUMA node.
+#
+# @nodeid: #optional NUMA node ID
+#
+# @cpus: #optional VCPUs belong to this node
+#
+# Since: 1.6
+##
+{ 'type': 'NumaNodeOptions',
+  'data': {
+   '*nodeid':  'int',
+   '*cpus':'str' }}
+
+##
+# @NumaMemOptions
+#
+# Set memory information of guest NUMA node.
+#
+# @nodeid: #optional NUMA node ID
+#
+# @size: #optional memory size of this node
+#
+# Since 1.6
+##
+{ 'type': 'NumaMemOptions',
+  'data': {
+   '*nodeid':  'int',
+   '*size':'size' }}
-- 
1.8.3.2.634.g7a3187e

[Qemu-devel] [PULL 2/5] pci: fix BRDIGE typo

2013-07-17 Thread Michael S. Tsirkin

From: Don Koch dk...@verizon.com

Fix typo in macro name: PCI_CLASS_BRDIGE_PCI_INF_SUB.

Signed-off-by: Don Koch dk...@verizon.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci-bridge/i82801b11.c | 2 +-
 include/hw/pci/pci_ids.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/pci-bridge/i82801b11.c b/hw/pci-bridge/i82801b11.c
index 5807a92..b98bfb0 100644
--- a/hw/pci-bridge/i82801b11.c
+++ b/hw/pci-bridge/i82801b11.c
@@ -69,7 +69,7 @@ static int i82801b11_bridge_initfn(PCIDevice *d)
 if (rc  0) {
 goto err_bridge;
 }
-pci_config_set_prog_interface(d-config, PCI_CLASS_BRDIGE_PCI_INF_SUB);
+pci_config_set_prog_interface(d-config, PCI_CLASS_BRIDGE_PCI_INF_SUB);
 return 0;
 
 err_bridge:
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 08f8161..d7933bf 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -39,7 +39,7 @@
 #define PCI_CLASS_BRIDGE_HOST0x0600
 #define PCI_CLASS_BRIDGE_ISA 0x0601
 #define PCI_CLASS_BRIDGE_PCI 0x0604
-#define PCI_CLASS_BRDIGE_PCI_INF_SUB 0x01
+#define PCI_CLASS_BRIDGE_PCI_INF_SUB 0x01
 #define PCI_CLASS_BRIDGE_OTHER   0x0680
 
 #define PCI_CLASS_COMMUNICATION_SERIAL   0x0700
-- 
MST

Re: [Qemu-devel] [Qemu-ppc] [PATCH 2/2] Add Enhanced Three-Speed Ethernet Controller (eTSEC)

2013-07-17 Thread Fabien Chouteau

On 07/16/2013 07:50 PM, Scott Wood wrote:
 On 07/16/2013 10:28:28 AM, Fabien Chouteau wrote:
 On 07/16/2013 04:06 AM, Scott Wood wrote:
  On 07/10/2013 12:10:02 PM, Fabien Chouteau wrote:
  +obj-$(CONFIG_ETSEC) += etsec.o etsec_registers.o etsec_rings.o 
  etsec_miim.o
 
  Maybe an fsl_etsec directory?
 

 Is it really necessary?
 
 Not strictly necessary, but it would help keep things tidy.
 

I will try :)

  +DeviceState *etsec_create(hwaddr base,
  +  MemoryRegion * mr,
  +  NICInfo  * nd,
  +  qemu_irq   tx_irq,
  +  qemu_irq   rx_irq,
  +  qemu_irq   err_irq)
 
  Do you plan to update hw/ppc/e500.c (or maybe some other platform?) to
  call this?
 

 No I don't, not for the moment. I use it in one of our machine (that is not 
 in mainstream).
 
 Someone should, so we can test this without your out-of-tree code.
 
 e500.c would require PCI support I think.
 
 e500.c has PCI support, but how is that relevant to eTSEC?

I guess it's not. We would have to remove:

if (pci_bus) {
/* Register network interfaces. */
for (i = 0; i  nb_nics; i++) {
pci_nic_init_nofail(nd_table[i], pci_bus, virtio, NULL);
}
}

 
  +if (*size == etsec-rx_padding) {
  +/* The remaining bytes are for padding which is not actually 
  allocated
  +   in the buffer */
  +
  +rem = MIN(etsec-regs[MRBLR].value - bd-length, 
  etsec-rx_padding);
  +
  +if (rem  0) {
  +memset(padd, 0x0, sizeof(padd));
  +etsec-rx_padding -= rem;
  +*size -= rem;
  +bd-length+= rem;
  +cpu_physical_memory_write(bufptr, padd, rem);
  +}
  +}
 
  What if *size  0  *size  etsec-rx_padding?

 I don't think it's possible...
 
 Maybe throw in an assertion, then?
 
 I can see how it might not be possible if rx_padding is being used for 
 padding a short frame, since MRBLR must be a multiple of 64, but what if it's 
 4 bytes for CRC?
 

Can you explain a possible error scenario?

  +static void rx_init_frame(eTSEC *etsec, const uint8_t *buf, size_t size)
  +{
  +uint32_t fcb_size = 0;
  +uint8_t  prsdep   = (etsec-regs[RCTRL].value  RCTRL_PRSDEP_OFFSET)
  + RCTRL_PRSDEP_MASK;
  +
  +if (prsdep != 0) {
  +/* Prepend FCB */
  +fcb_size = 8 + 2;  /* FCB size + align */
  +/* I can't find this 2 bytes alignement in fsl documentation but 
  VxWorks
  +   expects them */
 
  Could these 2 bytes be from RCTRL[PAD], which you seem to ignore?

 Did you mean RCTRL[PAL]? It could be that.
 
 Yes, sorry.
 

Thanks for pointing it ;)

  +etsec-rx_padding= 4;

 CRC.

  +if (size  60) {
  +etsec-rx_padding += 60 - size;
  +}
 
  Could you explain what you're doing with rx_padding?

 Avoiding short frames. I'll add a comments.
 
 This is for when RCTRL[RSF] is set, to accept short frames?  Is it documented 
 anywhere that these frames are zero-padded to 64 bytes on rx?  If not, is 
 this the observed behavior of real hardware?
 
 In any case, please add some comments.
 

Actually, this might be obsolete. At the time QEMU was sending short
frames, but vxWorks drop them, so I made this change to always have 64B
frames. Now I have another patch to fix this and I will remove this hack
and implement RCTRL[RSF].


  +/* ring_base = (etsec-regs[RBASEH].value  0xF)  32; */
  +ring_base += etsec-regs[RBASE0 + ring_nbr].value  ~0x7;
  +start_bd_addr  = bd_addr = etsec-regs[RBPTR0 + ring_nbr].value  
  ~0x7;
 
  What about RBDBPH (upper bits of physical address)?  Likewise for TX.
 

 I'm only interested in 32bits address spaces, so RBASEH, TBASEH, RBDBPH or 
 TBDBPH.

 When adding code to mainline, it's about more than what you're personally 
 interested in...


If I'm not personally interested, I will not have time to develop or
test a feature. If someone wants to do 36bit addressing, feel free to do
it, that's why I send the patch on QEMU-devel, but I won't implement
full support of eTSEC myself...

 Could you at least have a way to diagnose when the guest OS tries to
 use some functionality that you don't support, rather than silently
 doing the wrong thing?


This device is so complex, detecting unsupported features would take too
much work.

Regards,

-- 
Fabien Chouteau

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Peter Lieven


Am 17.07.2013 um 11:58 schrieb Paolo Bonzini pbonz...@redhat.com:

 Il 17/07/2013 10:46, Kevin Wolf ha scritto:
 Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
 if a destination has has_zero_init = 0, but it supports
 discard zeroes use discard to convert the target
 into an all zero device.
 
 Signed-off-by: Peter Lieven p...@kamp.de
 
 Wouldn't it be better to use bdrv_write_zeroes() and extend the
 implementation of that to use discard internally in those block drivers
 where it makes sense?
 
 Because here you're not really discarding (i.e. don't care about the
 sectors any more), but you want them to be zeroed.
 
 I thought the same yesterday when reviewing the series, but I'm not
 convinced.
 
 Discarding is not always the right way to write zeroes, because it can
 disrupt performance.  It may be fine when you are already going to write
 a sparse image (as is the case for qemu-img convert), but not in
 general.  So if you just used write_zeroes, it would have to fall under
 yet another -drive option (or an extension to -drive discard).  I
 think what Peter did is a good compromise in the end.
 
 BTW, Peter and Ronnie: we were assuming that UNMAP with LBPRZ=1 always
 zeroes blocks, but is that true for unaligned operations?

Good question, I will pass it to ronnie. My guess is that the command will fail 
with
a check condition if it failed to unmap the data. From what Ronnie sent earlier
it should be guaranteed that the blocks are at least zero after the unmap 
command.

As for the qemu-img patch this shouldn't matter. It uses always blocks of 
bdi-max_unmap
which should be a multiple of the alignment. It also checks if sectors are 
deallocated
after the unmap afterwards. If the unmap fails it falls back to has_zero_init 
=1.

But for the write_zeroes patch of my iscsi series it might be better to check 
after the discard
if the sectors are unallocated and otherways fail with -ENOTSUP. Just to be 
absolutely safe.

Peter

Re: [Qemu-devel] [PATCH v2] block: fix vvfat error path for enable_write_target

2013-07-17 Thread Laszlo Ersek

On 07/17/13 11:57, Fam Zheng wrote:
 s-qcow and s-qcow_filename are allocated but not freed on error. Fix the
 possible leaks, remove unnecessary check for bdrv_new(), propagate ret code of
 bdrv_create() and also the one of enable_write_target().
 
 Signed-off-by: Fam Zheng f...@redhat.com
 ---
 
 v2: Fix leak of s-qcow_filename, propagate returen value of
 enable_write_target(). [Laszlo]
 
 ---
  block/vvfat.c | 25 ++---
  1 file changed, 14 insertions(+), 11 deletions(-)

Looks good to me.

Reviewed-by: Laszlo Ersek ler...@redhat.com

Re: [Qemu-devel] [PATCH 2/3] migration: notify migration state before starting thread

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 11:35, Stefan Hajnoczi ha scritto:
 The migration thread runs outside the QEMU global mutex when possible.
 Therefore we must notify migration state change *before* starting the
 migration thread.
 
 This allows registered listeners to act before live migration iterations
 begin.  Therefore they can get into a state that allows for live
 migration.  When the migration thread starts everything will be ready.
 
 Signed-off-by: Stefan Hajnoczi stefa...@redhat.com
 ---
  migration.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/migration.c b/migration.c
 index 9f5a423..b4daf13 100644
 --- a/migration.c
 +++ b/migration.c
 @@ -625,7 +625,9 @@ void migrate_fd_connect(MigrationState *s)
  qemu_file_set_rate_limit(s-file,
   s-bandwidth_limit / XFER_LIMIT_RATIO);
  
 +/* Notify before starting migration thread */
 +notifier_list_notify(migration_state_notifiers, s);
 +
  qemu_thread_create(s-thread, migration_thread, s,
 QEMU_THREAD_JOINABLE);
 -notifier_list_notify(migration_state_notifiers, s);
  }
 

Acked-by: Paolo Bonzini pbonz...@redhat.com

Re: [Qemu-devel] [Qemu-ppc] [PATCH 2/2] Add Enhanced Three-Speed Ethernet Controller (eTSEC)

2013-07-17 Thread Alexander Graf


On 17.07.2013, at 12:17, Fabien Chouteau wrote:

 On 07/16/2013 07:50 PM, Scott Wood wrote:
 On 07/16/2013 10:28:28 AM, Fabien Chouteau wrote:
 On 07/16/2013 04:06 AM, Scott Wood wrote:
 On 07/10/2013 12:10:02 PM, Fabien Chouteau wrote:
 +obj-$(CONFIG_ETSEC) += etsec.o etsec_registers.o etsec_rings.o 
 etsec_miim.o
 
 Maybe an fsl_etsec directory?
 
 
 Is it really necessary?
 
 Not strictly necessary, but it would help keep things tidy.
 
 
 I will try :)
 
 +DeviceState *etsec_create(hwaddr base,
 +  MemoryRegion * mr,
 +  NICInfo  * nd,
 +  qemu_irq   tx_irq,
 +  qemu_irq   rx_irq,
 +  qemu_irq   err_irq)
 
 Do you plan to update hw/ppc/e500.c (or maybe some other platform?) to
 call this?
 
 
 No I don't, not for the moment. I use it in one of our machine (that is not 
 in mainstream).
 
 Someone should, so we can test this without your out-of-tree code.
 
 e500.c would require PCI support I think.
 
 e500.c has PCI support, but how is that relevant to eTSEC?
 
 I guess it's not. We would have to remove:
 
if (pci_bus) {
/* Register network interfaces. */
for (i = 0; i  nb_nics; i++) {
pci_nic_init_nofail(nd_table[i], pci_bus, virtio, NULL);
}
}

Well, this code is pretty broken anyways. It should honor the IF type that we 
request from it. Then we can also add eTSEC as one.

 
 
 +if (*size == etsec-rx_padding) {
 +/* The remaining bytes are for padding which is not actually 
 allocated
 +   in the buffer */
 +
 +rem = MIN(etsec-regs[MRBLR].value - bd-length, 
 etsec-rx_padding);
 +
 +if (rem  0) {
 +memset(padd, 0x0, sizeof(padd));
 +etsec-rx_padding -= rem;
 +*size -= rem;
 +bd-length+= rem;
 +cpu_physical_memory_write(bufptr, padd, rem);
 +}
 +}
 
 What if *size  0  *size  etsec-rx_padding?
 
 I don't think it's possible...
 
 Maybe throw in an assertion, then?
 
 I can see how it might not be possible if rx_padding is being used for 
 padding a short frame, since MRBLR must be a multiple of 64, but what if 
 it's 4 bytes for CRC?
 
 
 Can you explain a possible error scenario?
 
 +static void rx_init_frame(eTSEC *etsec, const uint8_t *buf, size_t size)
 +{
 +uint32_t fcb_size = 0;
 +uint8_t  prsdep   = (etsec-regs[RCTRL].value  RCTRL_PRSDEP_OFFSET)
 + RCTRL_PRSDEP_MASK;
 +
 +if (prsdep != 0) {
 +/* Prepend FCB */
 +fcb_size = 8 + 2;  /* FCB size + align */
 +/* I can't find this 2 bytes alignement in fsl documentation but 
 VxWorks
 +   expects them */
 
 Could these 2 bytes be from RCTRL[PAD], which you seem to ignore?
 
 Did you mean RCTRL[PAL]? It could be that.
 
 Yes, sorry.
 
 
 Thanks for pointing it ;)
 
 +etsec-rx_padding= 4;
 
 CRC.
 
 +if (size  60) {
 +etsec-rx_padding += 60 - size;
 +}
 
 Could you explain what you're doing with rx_padding?
 
 Avoiding short frames. I'll add a comments.
 
 This is for when RCTRL[RSF] is set, to accept short frames?  Is it 
 documented anywhere that these frames are zero-padded to 64 bytes on rx?  If 
 not, is this the observed behavior of real hardware?
 
 In any case, please add some comments.
 
 
 Actually, this might be obsolete. At the time QEMU was sending short
 frames, but vxWorks drop them, so I made this change to always have 64B
 frames. Now I have another patch to fix this and I will remove this hack
 and implement RCTRL[RSF].
 
 
 +/* ring_base = (etsec-regs[RBASEH].value  0xF)  32; */
 +ring_base += etsec-regs[RBASE0 + ring_nbr].value  ~0x7;
 +start_bd_addr  = bd_addr = etsec-regs[RBPTR0 + ring_nbr].value  
 ~0x7;
 
 What about RBDBPH (upper bits of physical address)?  Likewise for TX.
 
 
 I'm only interested in 32bits address spaces, so RBASEH, TBASEH, RBDBPH or 
 TBDBPH.
 
 When adding code to mainline, it's about more than what you're personally 
 interested in...
 
 
 If I'm not personally interested, I will not have time to develop or
 test a feature. If someone wants to do 36bit addressing, feel free to do
 it, that's why I send the patch on QEMU-devel, but I won't implement
 full support of eTSEC myself...
 
 Could you at least have a way to diagnose when the guest OS tries to
 use some functionality that you don't support, rather than silently
 doing the wrong thing?
 
 
 This device is so complex, detecting unsupported features would take too
 much work.

In this case a simple if(upper bits of addresses) hw_error(...) would be good 
enough :). It's just to make sure that all this valuable knowledge doesn't get 
lost any someone who eventually does run into 36bit addressing doesn't have to 
debug for a week to figure out who randomly overwrites his kernel ;).


Alex

Re: [Qemu-devel] [PATCH 0/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Peter Lieven


Am 16.07.2013 um 13:55 schrieb Paolo Bonzini pbonz...@redhat.com:

 Il 16/07/2013 13:40, Peter Lieven ha scritto:
 
 The conflict with your block status patches can't be large.
 upstream/master has no
 iscsi_co_is_allocated yet, so there should be no trouble.
 
 Yes, whoever goes second has to change it to the new get_block_status
 API.  Kevin and Stefan can decide who that lucky guy is.

Wouldn't you be able to merge it into scsi/next ?

Peter

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Peter Lieven


Am 17.07.2013 um 10:46 schrieb Kevin Wolf kw...@redhat.com:

 Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
 if a destination has has_zero_init = 0, but it supports
 discard zeroes use discard to convert the target
 into an all zero device.
 
 Signed-off-by: Peter Lieven p...@kamp.de
 
 Wouldn't it be better to use bdrv_write_zeroes() and extend the
 implementation of that to use discard internally in those block drivers
 where it makes sense?
 
 Because here you're not really discarding (i.e. don't care about the
 sectors any more), but you want them to be zeroed.

It is just a fall back in case we can't decide if has_zero_init is 1 easily.
This general approach has the benefit that it is also good for any host device
that has discard zeroes.

Peter

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Kevin Wolf

Am 17.07.2013 um 11:58 hat Paolo Bonzini geschrieben:
 Il 17/07/2013 10:46, Kevin Wolf ha scritto:
  Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
  if a destination has has_zero_init = 0, but it supports
  discard zeroes use discard to convert the target
  into an all zero device.
 
  Signed-off-by: Peter Lieven p...@kamp.de
  
  Wouldn't it be better to use bdrv_write_zeroes() and extend the
  implementation of that to use discard internally in those block drivers
  where it makes sense?
  
  Because here you're not really discarding (i.e. don't care about the
  sectors any more), but you want them to be zeroed.
 
 I thought the same yesterday when reviewing the series, but I'm not
 convinced.
 
 Discarding is not always the right way to write zeroes, because it can
 disrupt performance.  It may be fine when you are already going to write
 a sparse image (as is the case for qemu-img convert), but not in
 general.  So if you just used write_zeroes, it would have to fall under
 yet another -drive option (or an extension to -drive discard).

Maybe we need a flag to bdrv_write_zeroes() that specifies whether to
use discard or not, kind of like the unmap bit in WRITE SAME.

On the other hand, I think you're right that this is really policy,
and as such shouldn't be hardcoded based on our guesses, but be
configurable.

 I think what Peter did is a good compromise in the end.

At the very least it must become a separate function. img_convert() is
already too big and too deeply nested.

 BTW, Peter and Ronnie: we were assuming that UNMAP with LBPRZ=1 always
 zeroes blocks, but is that true for unaligned operations?

SBC-3 Rev. 35e, 5.16 READ CAPACITY (16) command:

A logical block provisioning read zeros (LBPRZ) bit set to one
indicates that, for read commands specifying an unmapped LBA (see
4.7.4.5), the device server returns user data set to zero [...]

So it depends on the block provisioning state of the LBA, not on the
operations that were performed on it.

5.28 UNMAP command:

If the ANCHOR bit in the CDB is set to zero, and the logical unit is
thin provisioned (see 4.7.3.3), then the logical block provisioning
state for each specified LBA:

a) should become deallocated;
b) may become anchored; or
c) may remain unchanged.

So with UNMAP, I think you don't have any guarantees that the LBA
becomes unmapped and therefore zeroed. It could just keep its current
data. No matter whether your request was aligned or not.

Kevin

Re: [Qemu-devel] [PATCH 16/17] block: add default get_block_status implementation for protocols

2013-07-17 Thread Peter Lieven


Am 16.07.2013 um 09:19 schrieb Paolo Bonzini pbonz...@redhat.com:

 Il 16/07/2013 08:47, Peter Lieven ha scritto:
 
 @@ -2977,7 +2977,11 @@ static int64_t coroutine_fn
 bdrv_co_get_block_status(BlockDriverState *bs,
if (!bs-drv-bdrv_co_get_block_status) {
  *pnum = nb_sectors;
 -return BDRV_BLOCK_DATA;
 +ret = BDRV_BLOCK_DATA;
 +if (bs-drv-protocol_name) {
 +ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num *
 BDRV_SECTOR_SIZE);
 +}
 +return ret;
  }
ret = bs-drv-bdrv_co_get_block_status(bs, sector_num,
 nb_sectors, pnum);
 I am curious if this is right. Doesn't this mean we say that at offset
 sector_num * BDRV_SECTOR_SIZE are nb_sectors of linear data? This is
 something we do not know for sure.
 
 Only for protocols.  In this case, we do know, or format=raw wouldn't
 work.  This is not propagated up to the actual BDS for the image, except
 for format=raw.

Wouldn't it be better to add this general tweak in to raw_co_is_allocated
rather than at the block level?

What you write above is true, but you will never know what will happen in the
future. For raw this tweak is right, but for anything developed in the future
it might not be and in the end nobody remembers to fix it at this position.

I was just thinking of the has_zero_init = 1 thing. At the time it was written
the code was correct and then came more and more extensions like
extends on a block device or iscsi and nobody remembered.

Peter

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Kevin Wolf

Am 17.07.2013 um 12:21 hat Peter Lieven geschrieben:
 
 Am 17.07.2013 um 11:58 schrieb Paolo Bonzini pbonz...@redhat.com:
 
  Il 17/07/2013 10:46, Kevin Wolf ha scritto:
  Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
  if a destination has has_zero_init = 0, but it supports
  discard zeroes use discard to convert the target
  into an all zero device.
  
  Signed-off-by: Peter Lieven p...@kamp.de
  
  Wouldn't it be better to use bdrv_write_zeroes() and extend the
  implementation of that to use discard internally in those block drivers
  where it makes sense?
  
  Because here you're not really discarding (i.e. don't care about the
  sectors any more), but you want them to be zeroed.
  
  I thought the same yesterday when reviewing the series, but I'm not
  convinced.
  
  Discarding is not always the right way to write zeroes, because it can
  disrupt performance.  It may be fine when you are already going to write
  a sparse image (as is the case for qemu-img convert), but not in
  general.  So if you just used write_zeroes, it would have to fall under
  yet another -drive option (or an extension to -drive discard).  I
  think what Peter did is a good compromise in the end.
  
  BTW, Peter and Ronnie: we were assuming that UNMAP with LBPRZ=1 always
  zeroes blocks, but is that true for unaligned operations?
 
 Good question, I will pass it to ronnie. My guess is that the command will 
 fail with
 a check condition if it failed to unmap the data. From what Ronnie sent 
 earlier
 it should be guaranteed that the blocks are at least zero after the unmap 
 command.
 
 As for the qemu-img patch this shouldn't matter. It uses always blocks of 
 bdi-max_unmap
 which should be a multiple of the alignment. It also checks if sectors are 
 deallocated
 after the unmap afterwards. If the unmap fails it falls back to has_zero_init 
 =1.

Well, you use bdrv_discard(), and ignoring discards is valid. Just
another reason to use bdrv_write_zeroes() instead.

Kevin

Re: [Qemu-devel] [PATCH 3/3] dataplane: enable virtio-blk x-data-plane=on live migration

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 11:35, Stefan Hajnoczi ha scritto:
 Although the dataplane thread does not cooperate with dirty memory
 logging yet it's fairly easy to temporarily disable dataplane during
 live migration.  This way virtio-blk can live migrate when
 x-data-plane=on.
 
 The dataplane thread will restart after migration is cancelled or if the
 guest resuming virtio-blk operation after migration completes.
 
 Signed-off-by: Stefan Hajnoczi stefa...@redhat.com
 ---
  hw/block/dataplane/virtio-blk.c | 17 -
  hw/block/virtio-blk.c   | 32 
  include/hw/virtio/virtio-blk.h  |  1 +
  3 files changed, 41 insertions(+), 9 deletions(-)
 
 diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
 index 2faed43..411becc 100644
 --- a/hw/block/dataplane/virtio-blk.c
 +++ b/hw/block/dataplane/virtio-blk.c
 @@ -18,7 +18,6 @@
  #include qemu/error-report.h
  #include hw/virtio/dataplane/vring.h
  #include ioq.h
 -#include migration/migration.h
  #include block/block.h
  #include hw/virtio/virtio-blk.h
  #include virtio-blk.h
 @@ -69,8 +68,6 @@ struct VirtIOBlockDataPlane {
   queue */
  
  unsigned int num_reqs;
 -
 -Error *migration_blocker;
  };
  
  /* Raise an interrupt to signal guest, if necessary */
 @@ -418,6 +415,14 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, 
 VirtIOBlkConf *blk,
  return false;
  }
  
 +/* If dataplane is (re-)enabled while the guest is running there could be
 + * block jobs that can conflict.
 + */
 +if (bdrv_in_use(blk-conf.bs)) {
 +error_report(cannot start dataplane thread while device is in use);
 +return false;
 +}
 +
  fd = raw_get_aio_fd(blk-conf.bs);
  if (fd  0) {
  error_report(drive is incompatible with x-data-plane, 
 @@ -433,10 +438,6 @@ bool virtio_blk_data_plane_create(VirtIODevice *vdev, 
 VirtIOBlkConf *blk,
  /* Prevent block operations that conflict with data plane thread */
  bdrv_set_in_use(blk-conf.bs, 1);
  
 -error_setg(s-migration_blocker,
 -x-data-plane does not support migration);
 -migrate_add_blocker(s-migration_blocker);
 -
  *dataplane = s;
  return true;
  }
 @@ -448,8 +449,6 @@ void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane 
 *s)
  }
  
  virtio_blk_data_plane_stop(s);
 -migrate_del_blocker(s-migration_blocker);
 -error_free(s-migration_blocker);
  bdrv_set_in_use(s-blk-conf.bs, 0);
  g_free(s);
  }
 diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
 index cf12469..cca0c77 100644
 --- a/hw/block/virtio-blk.c
 +++ b/hw/block/virtio-blk.c
 @@ -19,6 +19,7 @@
  #include hw/virtio/virtio-blk.h
  #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
  # include dataplane/virtio-blk.h
 +# include migration/migration.h
  #endif
  #include block/scsi.h
  #ifdef __linux__
 @@ -628,6 +629,34 @@ void virtio_blk_set_conf(DeviceState *dev, VirtIOBlkConf 
 *blk)
  memcpy((s-blk), blk, sizeof(struct VirtIOBlkConf));
  }
  
 +#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
 +/* Disable dataplane thread during live migration since it does not
 + * update the dirty memory bitmap yet.
 + */
 +static void virtio_blk_migration_state_changed(Notifier *notifier, void 
 *data)
 +{
 +VirtIOBlock *s = container_of(notifier, VirtIOBlock,
 +  migration_state_notifier);
 +MigrationState *mig = data;
 +
 +if (migration_is_active(mig)) {
 +if (!s-dataplane) {
 +return;
 +}
 +virtio_blk_data_plane_destroy(s-dataplane);
 +s-dataplane = NULL;
 +} else if (migration_has_finished(mig) ||
 +   migration_has_failed(mig)) {
 +if (s-dataplane) {
 +return;
 +}
 +bdrv_drain_all(); /* complete in-flight non-dataplane requests */
 +virtio_blk_data_plane_create(VIRTIO_DEVICE(s), s-blk,
 + s-dataplane);
 +}
 +}

Perhaps you can call bdrv_set_in_use here (set it to 1 after
destruction, and to 0 before creation), so that you do not need the
check in virtio_blk_data_plane_create?

 +#endif /* CONFIG_VIRTIO_BLK_DATA_PLANE */
 +
  static int virtio_blk_device_init(VirtIODevice *vdev)
  {
  DeviceState *qdev = DEVICE(vdev);
 @@ -664,6 +693,8 @@ static int virtio_blk_device_init(VirtIODevice *vdev)
  virtio_cleanup(vdev);
  return -1;
  }
 +s-migration_state_notifier.notify = virtio_blk_migration_state_changed;
 +add_migration_state_change_notifier(s-migration_state_notifier);
  #endif
  
  s-change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, 
 s);
 @@ -683,6 +714,7 @@ static int virtio_blk_device_exit(DeviceState *dev)
  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
  VirtIOBlock *s = VIRTIO_BLK(dev);
  #ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
 +remove_migration_state_change_notifier(s-migration_state_notifier);

Re: [Qemu-devel] [Qemu-ppc] [PATCH 2/2] Add Enhanced Three-Speed Ethernet Controller (eTSEC)

2013-07-17 Thread Fabien Chouteau

On 07/17/2013 10:29 AM, Alexander Graf wrote:
 
 
 Am 17.07.2013 um 10:24 schrieb Fabien Chouteau chout...@adacore.com:
 
 On 07/16/2013 06:54 PM, Scott Wood wrote:
 On 07/16/2013 11:15:51 AM, Fabien Chouteau wrote:
 On 07/16/2013 05:37 PM, Alexander Graf wrote:
 On 07/16/2013 05:28 PM, Fabien Chouteau wrote:
 On 07/16/2013 04:06 AM, Scott Wood wrote:
 On 07/10/2013 12:10:02 PM, Fabien Chouteau wrote:
 +/* ring_base = (etsec-regs[RBASEH].value  0xF)  32; */
 +ring_base += etsec-regs[RBASE0 + ring_nbr].value  ~0x7;
 +start_bd_addr  = bd_addr = etsec-regs[RBPTR0 + ring_nbr].value  
 ~0x7;
 What about RBDBPH (upper bits of physical address)?  Likewise for TX.
 I'm only interested in 32bits address spaces, so RBASEH, TBASEH, RBDBPH 
 or TBDBPH.

 Why? I thought e500mc and above can access more than 32bits of physical 
 address space?

 Yes but this is not emulated by QEMU, right? sizeof (hwaddr) for
 qemu-system-ppc is 8...

 36bit physical is emulated by QEMU.  Currently we put CCSR in a place that 
 would make it difficult to use memory above 4G, but that should change at 
 some point.


 But hwaddr is 32 bits, how could you call cpu_physical_memory_read()? to
 a 36bits address?
 
 8 x 8 = 64, no? :)
 

Oups, I was so sure of myself :)

-- 
Fabien Chouteau

Re: [Qemu-devel] [PATCH 0/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 12:23, Peter Lieven ha scritto:
 
 Am 16.07.2013 um 13:55 schrieb Paolo Bonzini pbonz...@redhat.com:
 
 Il 16/07/2013 13:40, Peter Lieven ha scritto:

 The conflict with your block status patches can't be large.
 upstream/master has no
 iscsi_co_is_allocated yet, so there should be no trouble.

 Yes, whoever goes second has to change it to the new get_block_status
 API.  Kevin and Stefan can decide who that lucky guy is.
 
 Wouldn't you be able to merge it into scsi/next ?

get_block_status is not work for the SCSI tree.  If you mean making it
easier for you to rebase on top, the v2 I posted yesterday is in branch
block-flags in my github repo.

Paolo

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Peter Lieven


Am 17.07.2013 um 12:27 schrieb Kevin Wolf kw...@redhat.com:

 Am 17.07.2013 um 12:21 hat Peter Lieven geschrieben:
 
 Am 17.07.2013 um 11:58 schrieb Paolo Bonzini pbonz...@redhat.com:
 
 Il 17/07/2013 10:46, Kevin Wolf ha scritto:
 Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
 if a destination has has_zero_init = 0, but it supports
 discard zeroes use discard to convert the target
 into an all zero device.
 
 Signed-off-by: Peter Lieven p...@kamp.de
 
 Wouldn't it be better to use bdrv_write_zeroes() and extend the
 implementation of that to use discard internally in those block drivers
 where it makes sense?
 
 Because here you're not really discarding (i.e. don't care about the
 sectors any more), but you want them to be zeroed.
 
 I thought the same yesterday when reviewing the series, but I'm not
 convinced.
 
 Discarding is not always the right way to write zeroes, because it can
 disrupt performance.  It may be fine when you are already going to write
 a sparse image (as is the case for qemu-img convert), but not in
 general.  So if you just used write_zeroes, it would have to fall under
 yet another -drive option (or an extension to -drive discard).  I
 think what Peter did is a good compromise in the end.
 
 BTW, Peter and Ronnie: we were assuming that UNMAP with LBPRZ=1 always
 zeroes blocks, but is that true for unaligned operations?
 
 Good question, I will pass it to ronnie. My guess is that the command will 
 fail with
 a check condition if it failed to unmap the data. From what Ronnie sent 
 earlier
 it should be guaranteed that the blocks are at least zero after the unmap 
 command.
 
 As for the qemu-img patch this shouldn't matter. It uses always blocks of 
 bdi-max_unmap
 which should be a multiple of the alignment. It also checks if sectors are 
 deallocated
 after the unmap afterwards. If the unmap fails it falls back to 
 has_zero_init =1.
 
 Well, you use bdrv_discard(), and ignoring discards is valid. Just
 another reason to use bdrv_write_zeroes() instead.

I use discard and afterwards check the provisioning state again. Just in case 
it silently failed for
whatever reason.

I am ok to have a general algorithm in bdrv_write_zeroes that is used as soon 
as the
block driver return discard_zeroes.

Would you be happy with that? I would tweak it that way that it uses discard to 
write zeroes
and double checks the provisioning status after the discard. If it hasn't 
turned to unmapped
real zeroes are written. I would further add code to honor max_unmap and 
discard_alignment
information.

Peter

Re: [Qemu-devel] [PATCH V5 01/12] NUMA: add NumaOptions, NumaNodeOptions and NumaMemOptions

2013-07-17 Thread Laszlo Ersek

comments below

On 07/17/13 11:29, Wanlong Gao wrote:
 Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
 ---
  qapi-schema.json | 44 
  1 file changed, 44 insertions(+)
 
 diff --git a/qapi-schema.json b/qapi-schema.json
 index 7b9fef1..f753a35 100644
 --- a/qapi-schema.json
 +++ b/qapi-schema.json
 @@ -3679,3 +3679,47 @@
  '*cpuid-input-ecx': 'int',
  'cpuid-register': 'X86CPURegister32',
  'features': 'int' } }
 +
 +##
 +# @NumaOptions
 +#
 +# A discriminated record of NUMA options.
 +#
 +# Since 1.6
 +##
 +{ 'union': 'NumaOptions',
 +  'data': {
 +'node':  'NumaNodeOptions',
 +'mem':   'NumaMemOptions' }}
 +
 +##
 +# @NumaNodeOptions
 +#
 +# Create a guest NUMA node.
 +#
 +# @nodeid: #optional NUMA node ID
 +#
 +# @cpus: #optional VCPUs belong to this node
 +#
 +# Since: 1.6
 +##
 +{ 'type': 'NumaNodeOptions',
 +  'data': {
 +   '*nodeid':'int',
 +   '*cpus':  'str' }}
 +

Should we document the format for cpus here too?

 +##
 +# @NumaMemOptions
 +#
 +# Set memory information of guest NUMA node.
 +#
 +# @nodeid: #optional NUMA node ID
 +#
 +# @size: #optional memory size of this node
 +#
 +# Since 1.6
 +##
 +{ 'type': 'NumaMemOptions',
 +  'data': {
 +   '*nodeid':'int',
 +   '*size':  'size' }}
 

Looks good in general but I'm not sure if hardware tabs are allowed (or
usual) in this file.

Thanks
Laszlo

[Qemu-devel] [PATCH V5 02/12] NUMA: split -numa option

2013-07-17 Thread Wanlong Gao

Change -numa option like following as Paolo suggested:
-numa node,nodeid=0,cpus=0-1 \
-numa mem,nodeid=0,size=1G

This new option will make later coming memory hotplug better.
And this new option is implemented using OptsVisitor.

Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
---
 Makefile.target |   2 +-
 include/sysemu/sysemu.h |   3 +
 numa.c  | 164 
 qemu-options.hx |   6 +-
 vl.c| 107 +++
 5 files changed, 182 insertions(+), 100 deletions(-)
 create mode 100644 numa.c

diff --git a/Makefile.target b/Makefile.target
index 9a49852..7e1fddf 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -113,7 +113,7 @@ endif #CONFIG_BSD_USER
 #
 # System emulator target
 ifdef CONFIG_SOFTMMU
-obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o
+obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o
 obj-y += qtest.o
 obj-y += hw/
 obj-$(CONFIG_FDT) += device_tree.o
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 3caeb66..cf8e6e5 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -132,6 +132,9 @@ extern QEMUClock *rtc_clock;
 extern int nb_numa_nodes;
 extern uint64_t node_mem[MAX_NODES];
 extern unsigned long *node_cpumask[MAX_NODES];
+extern QemuOptsList qemu_numa_opts;
+int numa_init_func(QemuOpts *opts, void *opaque);
+void set_numa_nodes(void);
 
 #define MAX_OPTION_ROMS 16
 typedef struct QEMUOptionRom {
diff --git a/numa.c b/numa.c
new file mode 100644
index 000..da68c4b
--- /dev/null
+++ b/numa.c
@@ -0,0 +1,164 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2013 Fujitsu Ltd.
+ * Author: Wanlong Gao gaowanl...@cn.fujitsu.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include sysemu/sysemu.h
+#include qemu/bitmap.h
+#include qapi-visit.h
+#include qapi/opts-visitor.h
+#include qapi/dealloc-visitor.h
+
+QemuOptsList qemu_numa_opts = {
+.name = numa,
+.implied_opt_name = type,
+.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
+.desc = { { 0 } } /* validated with OptsVisitor */
+};
+
+static int numa_node_parse_cpus(int nodenr, const char *cpus)
+{
+char *endptr;
+unsigned long long value, endvalue;
+
+/* Empty CPU range strings will be considered valid, they will simply
+ * not set any bit in the CPU bitmap.
+ */
+if (!*cpus) {
+return 0;
+}
+
+if (parse_uint(cpus, value, endptr, 10)  0) {
+goto error;
+}
+if (*endptr == '-') {
+if (parse_uint_full(endptr + 1, endvalue, 10)  0) {
+goto error;
+}
+} else if (*endptr == '\0') {
+endvalue = value;
+} else {
+goto error;
+}
+
+if (endvalue = MAX_CPUMASK_BITS) {
+endvalue = MAX_CPUMASK_BITS - 1;
+fprintf(stderr,
+qemu: NUMA: A max of %d VCPUs are supported\n,
+ MAX_CPUMASK_BITS);
+}
+
+if (endvalue  value) {
+goto error;
+}
+
+bitmap_set(node_cpumask[nodenr], value, endvalue-value+1);
+return 0;
+
+error:
+fprintf(stderr, qemu: Invalid NUMA CPU range: %s\n, cpus);
+return -1;
+}
+
+static int numa_node_parse(NumaNodeOptions *opts)
+{
+uint64_t nodenr;
+const char *cpus = NULL;
+
+nodenr = opts-nodeid;
+if (nodenr = MAX_NODES) {
+fprintf(stderr, qemu: Max number of NUMA nodes reached: %d\n,
+(int)nodenr);
+return -1;
+}
+
+cpus = opts-cpus;
+return numa_node_parse_cpus(nodenr, cpus);
+}
+
+static int numa_mem_parse(NumaMemOptions *opts)
+{
+uint64_t nodenr, mem_size;
+
+nodenr = opts-nodeid;
+if (nodenr = MAX_NODES) {
+fprintf(stderr, qemu: Max number of NUMA nodes reached: %d\n,
+(int)nodenr);
+return -1;
+}
+
+mem_size =

Re: [Qemu-devel] [PATCH 16/17] block: add default get_block_status implementation for protocols

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 12:26, Peter Lieven ha scritto:
 
 Am 16.07.2013 um 09:19 schrieb Paolo Bonzini pbonz...@redhat.com:
 
 Il 16/07/2013 08:47, Peter Lieven ha scritto:

 @@ -2977,7 +2977,11 @@ static int64_t coroutine_fn
 bdrv_co_get_block_status(BlockDriverState *bs,
if (!bs-drv-bdrv_co_get_block_status) {
  *pnum = nb_sectors;
 -return BDRV_BLOCK_DATA;
 +ret = BDRV_BLOCK_DATA;
 +if (bs-drv-protocol_name) {
 +ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num *
 BDRV_SECTOR_SIZE);
 +}
 +return ret;
  }
ret = bs-drv-bdrv_co_get_block_status(bs, sector_num,
 nb_sectors, pnum);
 I am curious if this is right. Doesn't this mean we say that at offset
 sector_num * BDRV_SECTOR_SIZE are nb_sectors of linear data? This is
 something we do not know for sure.

 Only for protocols.  In this case, we do know, or format=raw wouldn't
 work.  This is not propagated up to the actual BDS for the image, except
 for format=raw.
 
 Wouldn't it be better to add this general tweak in to raw_co_is_allocated
 rather than at the block level?

No, this is not a general tweak.  It is a default implementation (it
is protected by if (!bs-drv-bdrv_co_get_block_status).  Other
implementations can do the same, for example raw-posix.c also returns
BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE).

Each layer must return complete data.  If a higher layer wants to query
bs-file, it must obviously filters those flags that still make sense.

In the case of format=raw, all flags make sense.  For other layers, some
flags may not make sense and have to be stripped.

 What you write above is true, but you will never know what will happen in the
 future. For raw this tweak is right, but for anything developed in the future
 it might not be and in the end nobody remembers to fix it at this position.

Does the above explain it better?

Paolo

Re: [Qemu-devel] [PATCH 0/2] changes related to monitor flow control

2013-07-17 Thread Amit Shah

On (Tue) 16 Jul 2013 [20:19:39], Laszlo Ersek wrote:
 When the IO thread calls monitor_flush() repeatedly  quickly in
 succession, outside of callback context, many redundant G_IO_OUT watches
 are installed. (One such caller is the info tlb / tlb_info() HMP
 command which produces a lot of monitor output.)
 
 While this redundancy is no problem in itself, it can trigger -1/EINVAL
 in poll() by growing gpollfds beyond limits. This is a persistent
 condition, causing qemu to spin in the main loop.
 
 Patch #2 corrects this.
 
 My first stab at a fix was patch #1. Although in retrospect probably
 unrelated to the main problem, I'm including it because it should
 qualify as an improvement / cleanup in its own right.
 
 See https://bugzilla.redhat.com/show_bug.cgi?id=970047 for more
 details.

Reviewed-by: Amit Shah amit.s...@redhat.com

Amit

Re: [Qemu-devel] [PATCH 0/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Peter Lieven


Am 17.07.2013 um 12:28 schrieb Paolo Bonzini pbonz...@redhat.com:

 Il 17/07/2013 12:23, Peter Lieven ha scritto:
 
 Am 16.07.2013 um 13:55 schrieb Paolo Bonzini pbonz...@redhat.com:
 
 Il 16/07/2013 13:40, Peter Lieven ha scritto:
 
 The conflict with your block status patches can't be large.
 upstream/master has no
 iscsi_co_is_allocated yet, so there should be no trouble.
 
 Yes, whoever goes second has to change it to the new get_block_status
 API.  Kevin and Stefan can decide who that lucky guy is.
 
 Wouldn't you be able to merge it into scsi/next ?
 
 get_block_status is not work for the SCSI tree.  If you mean making it
 easier for you to rebase on top, the v2 I posted yesterday is in branch
 block-flags in my github repo.

That would be ok if the patches are merged first. Otherwise I could ask Kevin
to merge my old series (except the iscsi_co_write_zeroes patch as there
obviously is still room for discussion and improvement) and you could tweak
iscsi_co_is_allocated later?

Kevin, shall I prepare something for you to pull?

Peter

Re: [Qemu-devel] [Qemu-ppc] [PATCH 2/2] Add Enhanced Three-Speed Ethernet Controller (eTSEC)

2013-07-17 Thread Fabien Chouteau

On 07/17/2013 12:22 PM, Alexander Graf wrote:
 
 On 17.07.2013, at 12:17, Fabien Chouteau wrote:
 
 On 07/16/2013 07:50 PM, Scott Wood wrote:
 On 07/16/2013 10:28:28 AM, Fabien Chouteau wrote:
 On 07/16/2013 04:06 AM, Scott Wood wrote:
 On 07/10/2013 12:10:02 PM, Fabien Chouteau wrote:
 +/* ring_base = (etsec-regs[RBASEH].value  0xF)  32; */
 +ring_base += etsec-regs[RBASE0 + ring_nbr].value  ~0x7;
 +start_bd_addr  = bd_addr = etsec-regs[RBPTR0 + ring_nbr].value  
 ~0x7;

 What about RBDBPH (upper bits of physical address)?  Likewise for TX.


 I'm only interested in 32bits address spaces, so RBASEH, TBASEH, RBDBPH or 
 TBDBPH.

 When adding code to mainline, it's about more than what you're personally 
 interested in...


 If I'm not personally interested, I will not have time to develop or
 test a feature. If someone wants to do 36bit addressing, feel free to do
 it, that's why I send the patch on QEMU-devel, but I won't implement
 full support of eTSEC myself...

 Could you at least have a way to diagnose when the guest OS tries to
 use some functionality that you don't support, rather than silently
 doing the wrong thing?


 This device is so complex, detecting unsupported features would take too
 much work.
 
 In this case a simple if(upper bits of addresses) hw_error(...) would be 
 good enough :). It's just to make sure that all this valuable knowledge 
 doesn't get lost any someone who eventually does run into 36bit addressing 
 doesn't have to debug for a week to figure out who randomly overwrites his 
 kernel ;).
 

Since hwaddr is 64bits, I will add RBASEH, TBASEH, RBDBPH and TBDBPH support...

-- 
Fabien Chouteau

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 12:21, Peter Lieven ha scritto:
  BTW, Peter and Ronnie: we were assuming that UNMAP with LBPRZ=1 always
  zeroes blocks, but is that true for unaligned operations?
 Good question, I will pass it to ronnie. My guess is that the command will 
 fail with
 a check condition if it failed to unmap the data. From what Ronnie sent 
 earlier
 it should be guaranteed that the blocks are at least zero after the unmap 
 command.

I'm not so sure about that, SBC says explicitly that a misaligned unmap
request may result in unmap operations on fewer LBAs than requested.

Perhaps it's better to do this:

- if LBPWS=1, use WRITE SAME(16).  For WRITE SAME it is explicit that
if the device server does not deallocate or anchor the LBA, then the
device server shall perform the specified write operation.  Expose the
value of LBPRZ in the discard zeroes field of BlockDriverInfo.

- if LBPWS=0, you can still support discard iff LBPU=1 and use UNMAP.
But if LBPRZ=1 and LBPWS=0, you should not expose the discard zeroes
field of BlockDriverInfo.

 As for the qemu-img patch this shouldn't matter. It uses always blocks of 
 bdi-max_unmap
 which should be a multiple of the alignment.

Also, the remainder itself (which SBC calls UNMAP GRANULARITY
ALIGNMENT) may not be 0.  I think the above trick (only exposing LBPRZ
if you'll use WRITE SAME) will sidestep the problem.  It may also remove
the need to fall back to has_zero_init=1.

Paolo

Re: [Qemu-devel] [PATCH 4/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 12:27, Kevin Wolf ha scritto:
 Am 17.07.2013 um 12:21 hat Peter Lieven geschrieben:

 Am 17.07.2013 um 11:58 schrieb Paolo Bonzini pbonz...@redhat.com:

 Il 17/07/2013 10:46, Kevin Wolf ha scritto:
 Am 15.07.2013 um 12:49 hat Peter Lieven geschrieben:
 if a destination has has_zero_init = 0, but it supports
 discard zeroes use discard to convert the target
 into an all zero device.

 Signed-off-by: Peter Lieven p...@kamp.de

 Wouldn't it be better to use bdrv_write_zeroes() and extend the
 implementation of that to use discard internally in those block drivers
 where it makes sense?

 Because here you're not really discarding (i.e. don't care about the
 sectors any more), but you want them to be zeroed.

 I thought the same yesterday when reviewing the series, but I'm not
 convinced.

 Discarding is not always the right way to write zeroes, because it can
 disrupt performance.  It may be fine when you are already going to write
 a sparse image (as is the case for qemu-img convert), but not in
 general.  So if you just used write_zeroes, it would have to fall under
 yet another -drive option (or an extension to -drive discard).  I
 think what Peter did is a good compromise in the end.

 BTW, Peter and Ronnie: we were assuming that UNMAP with LBPRZ=1 always
 zeroes blocks, but is that true for unaligned operations?

 Good question, I will pass it to ronnie. My guess is that the command will 
 fail with
 a check condition if it failed to unmap the data. From what Ronnie sent 
 earlier
 it should be guaranteed that the blocks are at least zero after the unmap 
 command.

 As for the qemu-img patch this shouldn't matter. It uses always blocks of 
 bdi-max_unmap
 which should be a multiple of the alignment. It also checks if sectors are 
 deallocated
 after the unmap afterwards. If the unmap fails it falls back to 
 has_zero_init =1.
 
 Well, you use bdrv_discard(), and ignoring discards is valid. Just
 another reason to use bdrv_write_zeroes() instead.

He's only using it if discard_zeroes is true in the new BlockDriverInfo.
 We can define the semantics of that bit, and I think defining it as
ignored discards will still write zeroes is a good thing (same as what
SCSI targets do if you use WRITE SAME to do the discard operation).

Paolo

Re: [Qemu-devel] [PATCH 0/4] qemu-img: conditionally discard target on convert

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 12:40, Peter Lieven ha scritto:
 
 Am 17.07.2013 um 12:28 schrieb Paolo Bonzini pbonz...@redhat.com:
 
 Il 17/07/2013 12:23, Peter Lieven ha scritto:

 Am 16.07.2013 um 13:55 schrieb Paolo Bonzini pbonz...@redhat.com:

 Il 16/07/2013 13:40, Peter Lieven ha scritto:

 The conflict with your block status patches can't be large.
 upstream/master has no
 iscsi_co_is_allocated yet, so there should be no trouble.

 Yes, whoever goes second has to change it to the new get_block_status
 API.  Kevin and Stefan can decide who that lucky guy is.

 Wouldn't you be able to merge it into scsi/next ?

 get_block_status is not work for the SCSI tree.  If you mean making it
 easier for you to rebase on top, the v2 I posted yesterday is in branch
 block-flags in my github repo.
 
 That would be ok if the patches are merged first. Otherwise I could ask Kevin
 to merge my old series (except the iscsi_co_write_zeroes patch as there
 obviously is still room for discussion and improvement) and you could tweak
 iscsi_co_is_allocated later?

I'll look at your old series, I think the conflicts are relatively
trivial.  But I think that this series must wait for 1.7.

Paolo

Re: [Qemu-devel] [PATCH V5 02/12] NUMA: split -numa option

2013-07-17 Thread Laszlo Ersek

I'm reviewing this with respect to opts-visitor usage:

On 07/17/13 11:29, Wanlong Gao wrote:
 Change -numa option like following as Paolo suggested:
 -numa node,nodeid=0,cpus=0-1 \
 -numa mem,nodeid=0,size=1G
 
 This new option will make later coming memory hotplug better.
 And this new option is implemented using OptsVisitor.
 
 Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
 ---
  Makefile.target |   2 +-
  include/sysemu/sysemu.h |   3 +
  numa.c  | 164 
 
  qemu-options.hx |   6 +-
  vl.c| 107 +++
  5 files changed, 182 insertions(+), 100 deletions(-)
  create mode 100644 numa.c
 
 diff --git a/Makefile.target b/Makefile.target
 index 9a49852..7e1fddf 100644
 --- a/Makefile.target
 +++ b/Makefile.target
 @@ -113,7 +113,7 @@ endif #CONFIG_BSD_USER
  #
  # System emulator target
  ifdef CONFIG_SOFTMMU
 -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o
 +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o
  obj-y += qtest.o
  obj-y += hw/
  obj-$(CONFIG_FDT) += device_tree.o
 diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
 index 3caeb66..cf8e6e5 100644
 --- a/include/sysemu/sysemu.h
 +++ b/include/sysemu/sysemu.h
 @@ -132,6 +132,9 @@ extern QEMUClock *rtc_clock;
  extern int nb_numa_nodes;
  extern uint64_t node_mem[MAX_NODES];
  extern unsigned long *node_cpumask[MAX_NODES];
 +extern QemuOptsList qemu_numa_opts;
 +int numa_init_func(QemuOpts *opts, void *opaque);
 +void set_numa_nodes(void);
  
  #define MAX_OPTION_ROMS 16
  typedef struct QEMUOptionRom {
 diff --git a/numa.c b/numa.c
 new file mode 100644
 index 000..da68c4b
 --- /dev/null
 +++ b/numa.c
 @@ -0,0 +1,164 @@
 +/*
 + * QEMU System Emulator
 + *
 + * Copyright (c) 2013 Fujitsu Ltd.
 + * Author: Wanlong Gao gaowanl...@cn.fujitsu.com
 + *
 + * Permission is hereby granted, free of charge, to any person obtaining a 
 copy
 + * of this software and associated documentation files (the Software), to 
 deal
 + * in the Software without restriction, including without limitation the 
 rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in
 + * all copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 + * THE SOFTWARE.
 + */
 +
 +#include sysemu/sysemu.h
 +#include qemu/bitmap.h
 +#include qapi-visit.h
 +#include qapi/opts-visitor.h
 +#include qapi/dealloc-visitor.h
 +
 +QemuOptsList qemu_numa_opts = {
 +.name = numa,
 +.implied_opt_name = type,
 +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
 +.desc = { { 0 } } /* validated with OptsVisitor */
 +};

Looks OK.

 +
 +static int numa_node_parse_cpus(int nodenr, const char *cpus)
 +{
 +char *endptr;
 +unsigned long long value, endvalue;
 +
 +/* Empty CPU range strings will be considered valid, they will simply
 + * not set any bit in the CPU bitmap.
 + */
 +if (!*cpus) {
 +return 0;
 +}

I think this can crash, can't it? All option arguments (struct members)
you introduce in 01/12 are optional. For scalar types (boolean, int etc)
you can usually simply check the member itself (field  will have
value 0 if has_ is false), but strings, sub-structs etc. are
generated as pointers, and are NULL if has_ is false.

IOW numa_node_parse() may pass in a NULL cpus if ,cpus=... is
completely absent.

 +
 +if (parse_uint(cpus, value, endptr, 10)  0) {
 +goto error;
 +}
 +if (*endptr == '-') {
 +if (parse_uint_full(endptr + 1, endvalue, 10)  0) {
 +goto error;
 +}
 +} else if (*endptr == '\0') {
 +endvalue = value;
 +} else {
 +goto error;
 +}
 +
 +if (endvalue = MAX_CPUMASK_BITS) {
 +endvalue = MAX_CPUMASK_BITS - 1;
 +fprintf(stderr,
 +qemu: NUMA: A max of %d VCPUs are supported\n,
 + MAX_CPUMASK_BITS);
 +}
 +
 +if (endvalue  value) {
 +goto error;
 +}
 +
 +bitmap_set(node_cpumask[nodenr], value, endvalue-value+1);
 +return 0;
 +
 +error:
 +fprintf(stderr, qemu: Invalid NUMA CPU range: %s\n, cpus);
 +return -1;
 +}

The

Re: [Qemu-devel] [PULL 3/5] exec: Support 64-bit operations in address_space_rw

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 11:50, Markus Armbruster ha scritto:
 Richard Henderson r...@twiddle.net writes:
 
 Honor the implementation maximum access size, and at least check
 the minimum access size.

 Reviewed-by: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Richard Henderson r...@twiddle.net
 
 Fails for me:
 
 qemu-system-x86_64: /work/armbru/qemu/exec.c:1927: memory_access_size: 
 Assertion `l = access_size_min' failed.

This:

unsigned access_size_min = mr-ops-impl.min_access_size;
unsigned access_size_max = mr-ops-impl.max_access_size;

must be respectively:

unsigned access_size_min = 1;
unsigned access_size_max = mr-ops-valid.max_access_size;

access_size_min can be 1 because erroneous accesses must not crash 
QEMU, they should trigger exceptions in the guest or just return 
garbage (depending on the CPU).  I'm not sure I understand the comment, 
placing a 4-byte field at the last byte of a region makes no sense 
(unless impl.unaligned is true).

access_size_max can be mr-ops-valid.max_access_size because memory.c 
can and will still break accesses bigger than 
mr-ops-impl.max_access_size.

Markus, can you try the minimal patch above?  Or this one that also
does the consequent simplifications.

diff --git a/exec.c b/exec.c
index c99a883..0904283 100644
--- a/exec.c
+++ b/exec.c
@@ -1898,14 +1898,8 @@ static inline bool memory_access_is_direct(MemoryRegion 
*mr, bool is_write)
 
 static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
 {
-unsigned access_size_min = mr-ops-impl.min_access_size;
-unsigned access_size_max = mr-ops-impl.max_access_size;
+unsigned access_size_max = mr-ops-valid.max_access_size;
 
-/* Regions are assumed to support 1-4 byte accesses unless
-   otherwise specified.  */
-if (access_size_min == 0) {
-access_size_min = 1;
-}
 if (access_size_max == 0) {
 access_size_max = 4;
 }
@@ -1922,9 +1916,6 @@ static int memory_access_size(MemoryRegion *mr, unsigned 
l, hwaddr addr)
 if (l  access_size_max) {
 l = access_size_max;
 }
-/* ??? The users of this function are wrong, not supporting minimums larger
-   than the remaining length.  C.f. memory.c:access_with_adjusted_size.  */
-assert(l = access_size_min);
 
 return l;
 }

Paolo

Re: [Qemu-devel] [PATCH V5 01/12] NUMA: add NumaOptions, NumaNodeOptions and NumaMemOptions

2013-07-17 Thread Paolo Bonzini

Il 17/07/2013 12:35, Laszlo Ersek ha scritto:
 comments below
 
 On 07/17/13 11:29, Wanlong Gao wrote:
 Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
 ---
  qapi-schema.json | 44 
  1 file changed, 44 insertions(+)

 diff --git a/qapi-schema.json b/qapi-schema.json
 index 7b9fef1..f753a35 100644
 --- a/qapi-schema.json
 +++ b/qapi-schema.json
 @@ -3679,3 +3679,47 @@
  '*cpuid-input-ecx': 'int',
  'cpuid-register': 'X86CPURegister32',
  'features': 'int' } }
 +
 +##
 +# @NumaOptions
 +#
 +# A discriminated record of NUMA options.
 +#
 +# Since 1.6
 +##
 +{ 'union': 'NumaOptions',
 +  'data': {
 +'node': 'NumaNodeOptions',
 +'mem':  'NumaMemOptions' }}
 +
 +##
 +# @NumaNodeOptions
 +#
 +# Create a guest NUMA node.
 +#
 +# @nodeid: #optional NUMA node ID
 +#
 +# @cpus: #optional VCPUs belong to this node
 +#
 +# Since: 1.6
 +##
 +{ 'type': 'NumaNodeOptions',
 +  'data': {
 +   '*nodeid':   'int',
 +   '*cpus': 'str' }}
 +
 
 Should we document the format for cpus here too?

I think so---good catch.

 +##
 +# @NumaMemOptions
 +#
 +# Set memory information of guest NUMA node.
 +#
 +# @nodeid: #optional NUMA node ID
 +#
 +# @size: #optional memory size of this node
 +#
 +# Since 1.6
 +##
 +{ 'type': 'NumaMemOptions',
 +  'data': {
 +   '*nodeid':   'int',
 +   '*size': 'size' }}

 
 Looks good in general but I'm not sure if hardware tabs are allowed (or
 usual) in this file.

Definitely not usual---in fact not used at all, so they're probably not
allowed too.

Paolo

1 2 3 >

1 - 100 of 251 matches

Mail list logo