[Qemu-devel] [PATCH v4 0/3] target-arm: Add a few more S2 MMU input checks

2016-01-27 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

This adds the inputsize > pamax check and also fixes the
startlevel checks to apply to the 64bit translations.

Comments welcome!

Cheers,
Edgar

ChangeLog:

v3 -> v4:
* Changed comment regarding our choice to fault
* Rename check_s2_startlevel to check_s2_mmu_setup
* Move inputsize check to check_s2_mmu_setup

v2 -> v3:
* Document pamax arg to check_s2_startlevel

v1 -> v2:
* inputsize > pamax check only applies to AArch64
* Fix commit message typo < should be >

Edgar E. Iglesias (3):
  target-arm: Apply S2 MMU startlevel table size check to AArch64
  target-arm: Rename check_s2_startlevel to check_s2_mmu_setup
  target-arm: Implement the S2 MMU inputsize > pamax check

 target-arm/helper.c | 36 ++--
 1 file changed, 22 insertions(+), 14 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v4 1/3] target-arm: Apply S2 MMU startlevel table size check to AArch64

2016-01-27 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

The S2 starting level table size check applies to both AArch32
and AArch64. Move it to common code.

Reviewed-by: Alex Bennée 
Reviewed-by: Peter Maydell 
Signed-off-by: Edgar E. Iglesias 
---
 target-arm/helper.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index ae02486..5d6f297 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -6775,11 +6775,19 @@ typedef enum {
 static bool check_s2_startlevel(ARMCPU *cpu, bool is_aa64, int level,
 int inputsize, int stride)
 {
+const int grainsize = stride + 3;
+int startsizecheck;
+
 /* Negative levels are never allowed.  */
 if (level < 0) {
 return false;
 }
 
+startsizecheck = inputsize - ((3 - level) * stride + grainsize);
+if (startsizecheck < 1 || startsizecheck > stride + 4) {
+return false;
+}
+
 if (is_aa64) {
 unsigned int pamax = arm_pamax(cpu);
 
@@ -6803,20 +6811,12 @@ static bool check_s2_startlevel(ARMCPU *cpu, bool 
is_aa64, int level,
 g_assert_not_reached();
 }
 } else {
-const int grainsize = stride + 3;
-int startsizecheck;
-
 /* AArch32 only supports 4KB pages. Assert on that.  */
 assert(stride == 9);
 
 if (level == 0) {
 return false;
 }
-
-startsizecheck = inputsize - ((3 - level) * stride + grainsize);
-if (startsizecheck < 1 || startsizecheck > stride + 4) {
-return false;
-}
 }
 return true;
 }
-- 
1.9.1




Re: [Qemu-devel] [Qemu-ppc] [PATCH 00/13] cuda: misc fixes and cleanups

2016-01-27 Thread BALATON Zoltan

On Wed, 27 Jan 2016, Hervé Poussineau wrote:
Unfortunately, Finnix doesn't try to access I2C bus, not even to initialize 
it. I suppose it is because OpenBIOS doesn't describe the I2C bus in the 
device tree...


The kernel in Finnix seems to have a driver but unfortunately I2C 
debugging is not enabled so you probably won't get much useful info. The 
best idea in this case is probably to compile a kernel with debugging 
messages for I2C so you get some info on where it fails if it tries to 
find the bus at all.


It could be that the problem is that the device tree does not correctly 
describe something. Real hardware seems to have two i2c aliases related to 
some serial ports that are believed to possibly be causing problems with MacOS 9
(see: 
http://web.archive.org/web/20090107145016/http://penguinppc.org/historical/dev-trees-html/g4_agp_400_1.html#.)


ki2c-serial: /pci@f200/@d/mac-io@7/i2c/cereal
and
ui2c-serial: /uni-n/i2c/cereal

but I don't know if any of this is relevant, just mentioning it in the 
hope that some of this might help. (I've also noticed that real 
PowerMac3,1 seems to have a modem connected to one of its serial ports so 
that could also be something MacOS 9 is looking for or tries to use.)


Regards,
BALATON Zoltan

[Qemu-devel] [PATCH v4 2/3] target-arm: Rename check_s2_startlevel to check_s2_mmu_setup

2016-01-27 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Rename check_s2_startlevel to check_s2_mmu_setup in preparation
for additional checks.

Signed-off-by: Edgar E. Iglesias 
---
 target-arm/helper.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 5d6f297..13e9933 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -6763,17 +6763,18 @@ typedef enum {
 } MMUFaultType;
 
 /*
- * check_s2_startlevel
+ * check_s2_mmu_setup
  * @cpu:ARMCPU
  * @is_aa64:True if the translation regime is in AArch64 state
  * @startlevel: Suggested starting level
  * @inputsize:  Bitsize of IPAs
  * @stride: Page-table stride (See the ARM ARM)
  *
- * Returns true if the suggested starting level is OK and false otherwise.
+ * Returns true if the suggested S2 translation parameters are OK and
+ * false otherwise.
  */
-static bool check_s2_startlevel(ARMCPU *cpu, bool is_aa64, int level,
-int inputsize, int stride)
+static bool check_s2_mmu_setup(ARMCPU *cpu, bool is_aa64, int level,
+   int inputsize, int stride)
 {
 const int grainsize = stride + 3;
 int startsizecheck;
@@ -7013,8 +7014,7 @@ static bool get_phys_addr_lpae(CPUARMState *env, 
target_ulong address,
 }
 
 /* Check that the starting level is valid. */
-ok = check_s2_startlevel(cpu, va_size == 64, level,
- inputsize, stride);
+ok = check_s2_mmu_setup(cpu, va_size == 64, level, inputsize, stride);
 if (!ok) {
 /* AArch64 reports these as level 0 faults.
  * AArch32 reports these as level 1 faults.
-- 
1.9.1




[Qemu-devel] [PATCH v4 3/3] target-arm: Implement the S2 MMU inputsize > pamax check

2016-01-27 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Implement the inputsize > pamax check for Stage 2 translations.
This is CONSTRAINED UNPREDICTABLE and we choose to fault.

Signed-off-by: Edgar E. Iglesias 
---
 target-arm/helper.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 13e9933..9f75840 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -6790,6 +6790,7 @@ static bool check_s2_mmu_setup(ARMCPU *cpu, bool is_aa64, 
int level,
 }
 
 if (is_aa64) {
+CPUARMState *env = &cpu->env;
 unsigned int pamax = arm_pamax(cpu);
 
 switch (stride) {
@@ -6811,6 +6812,13 @@ static bool check_s2_mmu_setup(ARMCPU *cpu, bool 
is_aa64, int level,
 default:
 g_assert_not_reached();
 }
+
+/* Inputsize checks.  */
+if (inputsize > pamax &&
+(arm_el_is_aa64(env, 1) || inputsize > 40)) {
+/* This is CONSTRAINED UNPREDICTABLE and we choose to fault.  */
+return false;
+}
 } else {
 /* AArch32 only supports 4KB pages. Assert on that.  */
 assert(stride == 9);
-- 
1.9.1




Re: [Qemu-devel] [PATCHv2 06/10] target-ppc: Remove unused mmu models from ppc_tlb_invalidate_one

2016-01-27 Thread David Gibson
On Wed, Jan 27, 2016 at 07:06:26PM +0100, Laurent Vivier wrote:
> On 27/01/2016 11:13, David Gibson wrote:
> > ppc_tlb_invalidate_one() has a big switch handling many different MMU
> > types.  However, most of those branches can never be reached:
> > 
> > It is called from 3 places: from remove_hpte() and h_protect() in
> > spapr_hcall.c (which always has a 64-bit hash MMU type), and from
> > helper_tlbie() in mmu_helper.c.
> > 
> > Calls to helper_tlbie() are generated from gen_tlbie, gen_tlbiel and
> > gen_tlbiva.  The first two are only used with the PPC_MEM_TLBIE flag,
> > set only with 32-bit or 64-bit hash MMU models, and gen_tlbiva() is
> > used only on 440 and 460 models with the BookE mmu model.
> > 
> > This means the exhaustive list of MMU types which may call
> > ppc_tlb_invalidate_one() is: POWERPC_MMU_SOFT_6xx, POWERPC_MMU_601,
> > POWERPC_MMU_32B, POWERPC_MMU_SOFT_74xx, POWERPC_MMU_64B, POWERPC_MMU_2_03,
> > POWERPC_MMU_2_06, POWERPC_MMU_2_07 and POWERPC_MMU_BOOKE.
> > 
> > Clean up by removing logic for all other MMU types from
> > ppc_tlb_invalidate_one().
> > 
> > Signed-off-by: David Gibson 
> > ---
> >  target-ppc/mmu_helper.c | 20 ++--
> >  1 file changed, 2 insertions(+), 18 deletions(-)
> > 
> > diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
> > index c040b17..82ebe5d 100644
> > --- a/target-ppc/mmu_helper.c
> > +++ b/target-ppc/mmu_helper.c
> > @@ -1971,25 +1971,10 @@ void ppc_tlb_invalidate_one(CPUPPCState *env, 
> > target_ulong addr)
> >  ppc6xx_tlb_invalidate_virt(env, addr, 1);
> >  }
> >  break;
> > -case POWERPC_MMU_SOFT_4xx:
> > -case POWERPC_MMU_SOFT_4xx_Z:
> > -ppc4xx_tlb_invalidate_virt(env, addr, env->spr[SPR_40x_PID]);
> 
> This function is now called by no one.

Ah, yes.  Well, actually it was already called by no one, but now it's obvious.

> Perhaps it should move to the
> next patch in helper_tlbiva() (according to your comments) ?

Uh... I'm not exactly sure what you're suggesting.  Moving it to the
next patch doesn't really make sense - this is about the 4xx MMU type
which is *not* the same as the BookE MMU type used on 44x and 46x
(yes, that's confusing - one of the dangers of using an "xx" name).

Hmm.. not sure what to do with this - ppc4xx_tlb_invalidate_virt()
should be removed, but I don't know that it's worth respinning the
whole series just for that.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


[Qemu-devel] [RFC 0/1] arm: Setup EL1 and EL2 in AArch64 mode for 64bit Linux boots

2016-01-27 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Hi,

I've seen some regressions in my test runs lately when enabling
EL2 and EL3. It turns out we are not setting RW when booting 64bit
Linux kernels.

In the long run, I'm not sure if this does the right thing, for
example we might need to revisit if we ever want to support direct
loading of 32bit kernels on an AArch64 core.
Anyway, this fixes my testruns for now.

Cheers,
Edgar

Edgar E. Iglesias (1):
  hw/arm: Setup EL1 and EL2 in AArch64 mode for 64bit Linux boots

 hw/arm/boot.c | 2 ++
 1 file changed, 2 insertions(+)

-- 
1.9.1




[Qemu-devel] [RFC 1/1] hw/arm: Setup EL1 and EL2 in AArch64 mode for 64bit Linux boots

2016-01-27 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

When booting Linux on AArch64 enabled cores, setup EL1 and
EL2 to use AArch64.

Signed-off-by: Edgar E. Iglesias 
---
 hw/arm/boot.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 7742dd3..d05a998 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -488,7 +488,9 @@ static void do_cpu_reset(void *opaque)
  * adjust.
  */
 if (env->aarch64) {
+env->cp15.scr_el3 |= SCR_RW;
 if (arm_feature(env, ARM_FEATURE_EL2)) {
+env->cp15.hcr_el2 |= HCR_RW;
 env->pstate = PSTATE_MODE_EL2h;
 } else {
 env->pstate = PSTATE_MODE_EL1h;
-- 
1.9.1




Re: [Qemu-devel] [PULL 07/13] fdc: add pick_drive

2016-01-27 Thread John Snow


On 01/27/2016 07:38 AM, Paolo Bonzini wrote:
> 
> 
> On 25/01/2016 20:41, John Snow wrote:
>> Split apart pick_geometry by creating a pick_drive routine that will only
>> ever be called during device bring-up instead of relying on pick_geometry to
>> be used in both cases.
>>
>> With this change, the drive field is changed to be 'write once'. It is
>> not altered after the initialization routines exit.
>>
>> media_validated does not need to be migrated. The target VM
>> will just revalidate the media on post_load anyway.
>>
>> Reviewed-by: Eric Blake 
>> Signed-off-by: John Snow 
>> Message-id: 1453495865-9649-7-git-send-email-js...@redhat.com
> 
> Sorry, this breaks the RHEL6.5 Linux installer CD.  It just hangs at
> floppy detection and finally panics.
> 
> Paolo
> 

Looks like the problem is that after this patch, drives with no inserted
medium no longer populate the diskette geometry fields.

This leads to a problem where Linux attempts to seek to the first sector
on an empty drive. When it reads back the status interrupt information,
it finds that the command has succeeded but the current
head/track/sector values are unchanged, so it tries again. (I think. The
linux floppy code is... well, it certainly _is_.)

fd_seek itself guards against out-of-bounds seeks, even though Hervé
Poussineau patched fdctrl_handle_seek explicitly to allow such
out-of-bound seeks.

The end result is that QEMU "lies" about having done the seek, and Linux
appears to get very, very confused.

It looks as if QEMU tries to keep the current track within a sane
boundary for integrity reasons, but the Linux FDC driver expects to be
able to seek an empty drive. In reality, QEMU must consider current
sector/track to be "untrusted" values, but it currently tries to enforce
them being valid, but there doesn't appear to be any precedent for
refusing/erroring out on a SEEK command.

(Unrelatedly, in trying to fix this, I tried to see what would happen if
on an invalid seek through FD_SEEK alone I set ABNTERM, and it leads the
Linux kernel through a route where it tries to issue
DRIVE_SPECIFICATION, which will also break QEMU/Linux, because
DRIVE_SPECIFICATION can be anywhere between 3-7 bytes, but our version
accepts statically exactly 6 bytes.)

((Even fixing the above, Linux still doesn't appear to take a rejected
SEEK very well, just infinitely resetting, seeking, recalibrating.))

Given the above, it looks like the quick, dumb, and easy way to fix this
for now without risking jostling esoteric hardware will be to just set
dummy geometries on boot for empty drives, like we used to.

Hervé, any thoughts?

--js

>> ---
>>  hw/block/fdc.c | 56 ++--
>>  1 file changed, 46 insertions(+), 10 deletions(-)
>>
>> diff --git a/hw/block/fdc.c b/hw/block/fdc.c
>> index a8f0cf2..f8e070e 100644
>> --- a/hw/block/fdc.c
>> +++ b/hw/block/fdc.c
>> @@ -151,6 +151,7 @@ typedef struct FDrive {
>>  uint8_t media_rate;   /* Data rate of medium*/
>>  
>>  bool media_inserted;  /* Is there a medium in the tray */
>> +bool media_validated; /* Have we validated the media? */
>>  } FDrive;
>>  
>>  static void fd_init(FDrive *drv)
>> @@ -162,6 +163,8 @@ static void fd_init(FDrive *drv)
>>  drv->disk = FLOPPY_DRIVE_TYPE_NONE;
>>  drv->last_sect = 0;
>>  drv->max_track = 0;
>> +drv->ro = true;
>> +drv->media_changed = 1;
>>  }
>>  
>>  #define NUM_SIDES(drv) ((drv)->flags & FDISK_DBL_SIDES ? 2 : 1)
>> @@ -244,13 +247,24 @@ static void fd_recalibrate(FDrive *drv)
>>  fd_seek(drv, 0, 0, 1, 1);
>>  }
>>  
>> -static void pick_geometry(FDrive *drv)
>> +/**
>> + * Determine geometry based on inserted diskette.
>> + * Will not operate on an empty drive.
>> + *
>> + * @return: 0 on success, -1 if the drive is empty.
>> + */
>> +static int pick_geometry(FDrive *drv)
>>  {
>>  BlockBackend *blk = drv->blk;
>>  const FDFormat *parse;
>>  uint64_t nb_sectors, size;
>>  int i, first_match, match;
>>  
>> +/* We can only pick a geometry if we have a diskette. */
>> +if (!drv->media_inserted) {
>> +return -1;
>> +}
>> +
>>  blk_get_geometry(blk, &nb_sectors);
>>  match = -1;
>>  first_match = -1;
>> @@ -290,31 +304,51 @@ static void pick_geometry(FDrive *drv)
>>  }
>>  drv->max_track = parse->max_track;
>>  drv->last_sect = parse->last_sect;
>> -drv->drive = parse->drive;
>> -drv->disk = drv->media_inserted ? parse->drive : FLOPPY_DRIVE_TYPE_NONE;
>> +drv->disk = parse->drive;
>>  drv->media_rate = parse->rate;
>> +return 0;
>> +}
>> +
>> +static void pick_drive_type(FDrive *drv)
>> +{
>> +if (pick_geometry(drv) == 0) {
>> +drv->drive = drv->disk;
>> +} else {
>> +/* Legacy behavior: default to 1.44MB floppy */
>> +drv->drive = FLOPPY_DRIVE_TYPE_144;
>> +}
>>  }
>>  
>>  /* Revalidate a disk drive after a disk change */
>>  

Re: [Qemu-devel] [Qemu-ppc] [PATCH 00/13] cuda: misc fixes and cleanups

2016-01-27 Thread Hervé Poussineau

Le 26/01/2016 12:32, BALATON Zoltan a écrit :

On Mon, 25 Jan 2016, Hervé Poussineau wrote:

Do you have a Linux/NetBSD/... image, where I can run some command line tool to 
probe the I2C bus?


Have you tried the iso from www.finnix.org? When booting it you may see a boot 
prompt on black screen with dark gray text first which is hard to read but 
pressing enter here should go on to boot.



Unfortunately, Finnix doesn't try to access I2C bus, not even to initialize it. 
I suppose it is because OpenBIOS doesn't describe the I2C bus in the device 
tree...

Hervé




Re: [Qemu-devel] [PATCH] net/traffic-mirrorer:Add traffic-mirroer

2016-01-27 Thread Zhang Chen



On 01/28/2016 12:10 AM, Eric Blake wrote:

On 01/26/2016 05:44 PM, Hailiang Zhang wrote:

On 2016/1/26 16:59, Zhang Chen wrote:

From: ZhangChen 

Traffic-mirrorer is a netfilter plugin.
It gives qemu the ability to copy and mirror the guest's
network packets. The packets are output to a chardev.

+static void traffic_mirrorer_setup(NetFilterState *nf, Error **errp)
+{
+MirrorerState *s = FILTER_TRAFFIC_MIRRORER(nf);
+
+if (!s->outdev) {
+error_setg(errp, "filter traffic mirrorer needs 'outdev'
property set!"
+"property set!");

Duplicate 'property set!'.

For that matter, error_setg() messages should never end in '!'.



Thanks, I will fix it in V3
zhangchen


--
Thanks
zhangchen






[Qemu-devel] Running PPC Mac OS X Tiger Apps via Qemu on El Capitan

2016-01-27 Thread Gabriel Diggs
Hello,
I’m trying to get my head around running a PPC App created to run on OS X 10.4 
Tiger on El Capitan 10.11.3. I came across Qemu as a possible solution. The app 
in question is Traktor DJ Studio 3. It’s a Midi and Audio intensive app so 
latency has to be very low and it needs to be able to communicate seamlessly 
with a USB linked midi controller. Is it possible to wrap the app in a Tiger 
flavored Qemu similar to wrapping a Windows app in Wine in El Capitan? If so 
can you direct me to easy to read instructions to get everything setup?

Cheers,
Gabriel

 
  
      
  
  
  





Re: [Qemu-devel] [ipxe-devel] [PATCH ipxe] build: Enable IPv6 for qemu

2016-01-27 Thread Christian Nilsson
On Wed, Jan 27, 2016 at 2:49 PM, Gerd Hoffmann  wrote:
>   Hi,
>
>> We already have the named config mechanism.  I wonder if building a BIOS
>> option ROM for a real NIC is sufficiently specialised that it would make
>> sense to have a CONFIG=rom or CONFIG=minimal named configuration.
>
> Maybe name the configs "rom64k" or "rom128k", to make clear they are
> stripped down to keep the size below a certain limit?
>
> cheers,
>   Gerd

How common is it to build EFI roms, compared to building ipxe.efi or
snponly.efi?
On IRC, ROMs are quite a rare topic compared to non-ROM builds, but maybe
that's because those that build roms don't have that many questions.
My preferred default would be IPv6 enabled for EFI but not PCBIOS, but
focus on the most frequent build/usecase.

Adding "rom64k" and "rom128k" sounds like a good basics to then
disable features from the normal?

/Christian



[Qemu-devel] [PATCH v6 4/5] acpi: arm: add fw_cfg device node to dsdt

2016-01-27 Thread Gabriel L. Somlo
Add a fw_cfg device node to the ACPI DSDT. This is mostly
informational, as the authoritative fw_cfg MMIO region(s)
are listed in the Device Tree. However, since we are building
ACPI tables, we might as well be thorough while at it...

Signed-off-by: Gabriel Somlo 
Reviewed-by: Laszlo Ersek 
Tested-by: Laszlo Ersek 
Reviewed-by: Marc Marí 
---
 hw/arm/virt-acpi-build.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 87fbe7c..20bbdf2 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -95,6 +95,20 @@ static void acpi_dsdt_add_uart(Aml *scope, const MemMapEntry 
*uart_memmap,
 aml_append(scope, dev);
 }
 
+static void acpi_dsdt_add_fw_cfg(Aml *scope, const MemMapEntry *fw_cfg_memmap)
+{
+Aml *dev = aml_device("FWCF");
+aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0002")));
+/* device present, functioning, decoding, not shown in UI */
+aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
+
+Aml *crs = aml_resource_template();
+aml_append(crs, aml_memory32_fixed(fw_cfg_memmap->base,
+   fw_cfg_memmap->size, AML_READ_WRITE));
+aml_append(dev, aml_name_decl("_CRS", crs));
+aml_append(scope, dev);
+}
+
 static void acpi_dsdt_add_flash(Aml *scope, const MemMapEntry *flash_memmap)
 {
 Aml *dev, *crs;
@@ -565,6 +579,7 @@ build_dsdt(GArray *table_data, GArray *linker, 
VirtGuestInfo *guest_info)
 acpi_dsdt_add_uart(scope, &memmap[VIRT_UART],
(irqmap[VIRT_UART] + ARM_SPI_BASE));
 acpi_dsdt_add_flash(scope, &memmap[VIRT_FLASH]);
+acpi_dsdt_add_fw_cfg(scope, &memmap[VIRT_FW_CFG]);
 acpi_dsdt_add_virtio(scope, &memmap[VIRT_MMIO],
 (irqmap[VIRT_MMIO] + ARM_SPI_BASE), NUM_VIRTIO_TRANSPORTS);
 acpi_dsdt_add_pci(scope, memmap, (irqmap[VIRT_PCIE] + ARM_SPI_BASE),
-- 
2.4.3




[Qemu-devel] [PATCH v6 5/5] fw_cfg: document ACPI device node information

2016-01-27 Thread Gabriel L. Somlo
Signed-off-by: Gabriel Somlo 
Reviewed-by: Laszlo Ersek 
Reviewed-by: Marc Marí 
---
 docs/specs/fw_cfg.txt | 9 +
 1 file changed, 9 insertions(+)

diff --git a/docs/specs/fw_cfg.txt b/docs/specs/fw_cfg.txt
index 2099ad9..5414140 100644
--- a/docs/specs/fw_cfg.txt
+++ b/docs/specs/fw_cfg.txt
@@ -84,6 +84,15 @@ Selector Register address: Base + 8 (2 bytes)
 Data Register address: Base + 0 (8 bytes)
 DMA Address address:   Base + 16 (8 bytes)
 
+== ACPI Interface ==
+
+The fw_cfg device is defined with ACPI ID "QEMU0002". Since we expect
+ACPI tables to be passed into the guest through the fw_cfg device itself,
+the guest-side firmware can not use ACPI to find fw_cfg. However, once the
+firmware is finished setting up ACPI tables and hands control over to the
+guest kernel, the latter can use the fw_cfg ACPI node for a more accurate
+inventory of in-use IOport or MMIO regions.
+
 == Firmware Configuration Items ==
 
 === Signature (Key 0x0000, FW_CFG_SIGNATURE) ===
-- 
2.4.3




Re: [Qemu-devel] [PATCH v8 03/16] block: Add BB-BDS remove/insert notifiers

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> bdrv_close() no longer signifies ejection of a medium, this is now done
> by removing the BDS from the BB. Therefore, we want to have a notifier
> for that in the BB instead of a close notifier in the BDS. The former is
> added now, the latter is removed later.
> 
> Symmetrically, another notifier list is added that is invoked whenever a
> BDS is inserted. We will need that for virtio-blk and virtio-scsi, which
> can then remove their op blockers on BDS ejection and set them up on
> insertion.
> 
> Signed-off-by: Max Reitz 
> Reviewed-by: Kevin Wolf 
> ---
>  block/block-backend.c  | 20 
>  include/sysemu/block-backend.h |  2 ++
>  2 files changed, 22 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index a4208f1..1872191 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -49,6 +49,8 @@ struct BlockBackend {
>  BlockdevOnError on_read_error, on_write_error;
>  bool iostatus_enabled;
>  BlockDeviceIoStatus iostatus;
> +
> +NotifierList remove_bs_notifiers, insert_bs_notifiers;
>  };
>  
>  typedef struct BlockBackendAIOCB {
> @@ -99,6 +101,8 @@ BlockBackend *blk_new(const char *name, Error **errp)
>  blk = g_new0(BlockBackend, 1);
>  blk->name = g_strdup(name);
>  blk->refcnt = 1;
> +notifier_list_init(&blk->remove_bs_notifiers);
> +notifier_list_init(&blk->insert_bs_notifiers);
>  QTAILQ_INSERT_TAIL(&blk_backends, blk, link);
>  return blk;
>  }
> @@ -167,6 +171,8 @@ static void blk_delete(BlockBackend *blk)
>  bdrv_unref(blk->bs);
>  blk->bs = NULL;
>  }
> +assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
> +assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
>  if (blk->root_state.throttle_state) {
>  g_free(blk->root_state.throttle_group);
>  throttle_group_unref(blk->root_state.throttle_state);
> @@ -345,6 +351,8 @@ void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend 
> *blk)
>   */
>  void blk_remove_bs(BlockBackend *blk)
>  {
> +notifier_list_notify(&blk->remove_bs_notifiers, blk);
> +
>  blk_update_root_state(blk);
>  
>  blk->bs->blk = NULL;
> @@ -361,6 +369,8 @@ void blk_insert_bs(BlockBackend *blk, BlockDriverState 
> *bs)
>  bdrv_ref(bs);
>  blk->bs = bs;
>  bs->blk = blk;
> +
> +notifier_list_notify(&blk->insert_bs_notifiers, blk);
>  }
>  
>  /*
> @@ -1126,6 +1136,16 @@ void blk_remove_aio_context_notifier(BlockBackend *blk,
>  }
>  }
>  
> +void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
> +{
> +notifier_list_add(&blk->remove_bs_notifiers, notify);
> +}
> +
> +void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
> +{
> +notifier_list_add(&blk->insert_bs_notifiers, notify);
> +}
> +
>  void blk_add_close_notifier(BlockBackend *blk, Notifier *notify)
>  {
>  if (blk->bs) {
> diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
> index 1568554..e12be67 100644
> --- a/include/sysemu/block-backend.h
> +++ b/include/sysemu/block-backend.h
> @@ -164,6 +164,8 @@ void blk_remove_aio_context_notifier(BlockBackend *blk,
>void *),
>   void (*detach_aio_context)(void *),
>   void *opaque);
> +void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify);
> +void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify);
>  void blk_add_close_notifier(BlockBackend *blk, Notifier *notify);
>  void blk_io_plug(BlockBackend *blk);
>  void blk_io_unplug(BlockBackend *blk);
> -- 
> 2.7.0
> 

Reviewed-by: Fam Zheng 



[Qemu-devel] [PATCH] block: Remove unused struct definition BlockFinishData

2016-01-27 Thread Fam Zheng
Unused since 94db6d2d3.

Signed-off-by: Fam Zheng 
---
 blockjob.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index 80adb9d..a692142 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -278,14 +278,6 @@ void block_job_iostatus_reset(BlockJob *job)
 }
 }
 
-struct BlockFinishData {
-BlockJob *job;
-BlockCompletionFunc *cb;
-void *opaque;
-bool cancelled;
-int ret;
-};
-
 static int block_job_finish_sync(BlockJob *job,
  void (*finish)(BlockJob *, Error **errp),
  Error **errp)
-- 
2.4.3




Re: [Qemu-devel] [PATCHv2 09/10] target-ppc: Helper to determine page size information from hpte alone

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> h_enter() in the spapr code needs to know the page size of the HPTE
> it's
> about to insert.  Unlike other paths that do this, it doesn't have
> access
> to the SLB, so at the moment it determines this with some open-coded
> tests which assume POWER7 or POWER8 page size encodings.
> 
> To make this more flexible add ppc_hash64_hpte_page_shift_noslb() to
> determine both the "base" page size per segment, and the individual
> effective page size from an HPTE alone.
> 
> This means that the spapr code should now be able to handle any page
> size
> listed in the env->sps table.
> 
> Signed-off-by: David Gibson 

Acked-by: Benjamin Herrenschmidt 

> ---
>  hw/ppc/spapr_hcall.c| 25 ++---
>  target-ppc/mmu-hash64.c | 35 +++
>  target-ppc/mmu-hash64.h |  3 +++
>  3 files changed, 44 insertions(+), 19 deletions(-)
> 
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index dedc7e0..a535c73 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -72,31 +72,18 @@ static target_ulong h_enter(PowerPCCPU *cpu,
> sPAPRMachineState *spapr,
>  target_ulong pte_index = args[1];
>  target_ulong pteh = args[2];
>  target_ulong ptel = args[3];
> -target_ulong page_shift = 12;
> +unsigned apshift, spshift;
>  target_ulong raddr;
>  target_ulong index;
>  uint64_t token;
>  
> -/* only handle 4k and 16M pages for now */
> -if (pteh & HPTE64_V_LARGE) {
> -#if 0 /* We don't support 64k pages yet */
> -if ((ptel & 0xf000) == 0x1000) {
> -/* 64k page */
> -} else
> -#endif
> -if ((ptel & 0xff000) == 0) {
> -/* 16M page */
> -page_shift = 24;
> -/* lowest AVA bit must be 0 for 16M pages */
> -if (pteh & 0x80) {
> -return H_PARAMETER;
> -}
> -} else {
> -return H_PARAMETER;
> -}
> +apshift = ppc_hash64_hpte_page_shift_noslb(cpu, pteh, ptel,
> &spshift);
> +if (!apshift) {
> +/* Bad page size encoding */
> +return H_PARAMETER;
>  }
>  
> -raddr = (ptel & HPTE64_R_RPN) & ~((1ULL << page_shift) - 1);
> +raddr = (ptel & HPTE64_R_RPN) & ~((1ULL << apshift) - 1);
>  
>  if (is_ram_address(spapr, raddr)) {
>  /* Regular RAM - should have WIMG=0010 */
> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> index 3284776..19ee942 100644
> --- a/target-ppc/mmu-hash64.c
> +++ b/target-ppc/mmu-hash64.c
> @@ -512,6 +512,41 @@ static unsigned hpte_page_shift(const struct
> ppc_one_seg_page_size *sps,
>  return 0; /* Bad page size encoding */
>  }
>  
> +unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
> +  uint64_t pte0, uint64_t
> pte1,
> +  unsigned *seg_page_shift)
> +{
> +CPUPPCState *env = &cpu->env;
> +int i;
> +
> +if (!(pte0 & HPTE64_V_LARGE)) {
> +*seg_page_shift = 12;
> +return 12;
> +}
> +
> +/*
> + * The encodings in env->sps need to be carefully chosen so that
> + * this gives an unambiguous result.
> + */
> +for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> +const struct ppc_one_seg_page_size *sps = &env->sps.sps[i];
> +unsigned shift;
> +
> +if (!sps->page_shift) {
> +break;
> +}
> +
> +shift = hpte_page_shift(sps, pte0, pte1);
> +if (shift) {
> +*seg_page_shift = sps->page_shift;
> +return shift;
> +}
> +}
> +
> +*seg_page_shift = 0;
> +return 0;
> +}
> +
>  int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, target_ulong eaddr,
>  int rwx, int mmu_idx)
>  {
> diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
> index 293a951..34cf975 100644
> --- a/target-ppc/mmu-hash64.h
> +++ b/target-ppc/mmu-hash64.h
> @@ -16,6 +16,9 @@ void ppc_hash64_store_hpte(PowerPCCPU *cpu,
> target_ulong index,
>  void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
> target_ulong pte_index,
> target_ulong pte0, target_ulong
> pte1);
> +unsigned ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
> +  uint64_t pte0, uint64_t
> pte1,
> +  unsigned *seg_page_shift);
>  #endif
>  
>  /*


Re: [Qemu-devel] [PATCHv2 08/10] target-ppc: Add new TLB invalidate by HPTE call for hash64 MMUs

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> When HPTEs are removed or modified by hypercalls on spapr, we need to
> invalidate the relevant pages in the qemu TLB.
> 
> Currently we do that by doing some complicated calculations to work out the
> right encoding for the tlbie instruction, then passing that to
> ppc_tlb_invalidate_one()... which totally ignores the argument and flushes
> the whole tlb.
> 
> Avoid that by adding a new flush-by-hpte helper in mmu-hash64.c.

Should we find a better "in between" so that in the long run we implement tlbie
properly ? IE, tlbie will give us the page size using the same encoding
as the HPTE iirc when L=1 ? To be honest the encoding of tlbie in arch
2.07 is so completely insane I have a hard time figuring it out myself
... :-)

Otherwise,

Acked-by: Benjamin Herrenschmidt 

> Signed-off-by: David Gibson 
> ---
>  hw/ppc/spapr_hcall.c| 46 ---
> ---
>  target-ppc/mmu-hash64.c | 12 
>  target-ppc/mmu-hash64.h |  3 +++
>  3 files changed, 19 insertions(+), 42 deletions(-)
> 
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 4707196..dedc7e0 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -37,42 +37,6 @@ static void set_spr(CPUState *cs, int spr,
> target_ulong value,
>  run_on_cpu(cs, do_spr_sync, &s);
>  }
>  
> -static target_ulong compute_tlbie_rb(target_ulong v, target_ulong r,
> - target_ulong pte_index)
> -{
> -target_ulong rb, va_low;
> -
> -rb = (v & ~0x7fULL) << 16; /* AVA field */
> -va_low = pte_index >> 3;
> -if (v & HPTE64_V_SECONDARY) {
> -va_low = ~va_low;
> -}
> -/* xor vsid from AVA */
> -if (!(v & HPTE64_V_1TB_SEG)) {
> -va_low ^= v >> 12;
> -} else {
> -va_low ^= v >> 24;
> -}
> -va_low &= 0x7ff;
> -if (v & HPTE64_V_LARGE) {
> -rb |= 1; /* L field */
> -#if 0 /* Disable that P7 specific bit for now */
> -if (r & 0xff000) {
> -/* non-16MB large page, must be 64k */
> -/* (masks depend on page size) */
> -rb |= 0x1000;/* page encoding in LP
> field */
> -rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field
> */
> -rb |= (va_low & 0xfe);   /* AVAL field */
> -}
> -#endif
> -} else {
> -/* 4kB page */
> -rb |= (va_low & 0x7ff) << 12;   /* remaining 11b of AVA */
> -}
> -rb |= (v >> 54) & 0x300;/* B field */
> -return rb;
> -}
> -
>  static inline bool valid_pte_index(CPUPPCState *env, target_ulong
> pte_index)
>  {
>  /*
> @@ -198,7 +162,7 @@ static RemoveResult remove_hpte(PowerPCCPU *cpu,
> target_ulong ptex,
>  {
>  CPUPPCState *env = &cpu->env;
>  uint64_t token;
> -target_ulong v, r, rb;
> +target_ulong v, r;
>  
>  if (!valid_pte_index(env, ptex)) {
>  return REMOVE_PARM;
> @@ -217,8 +181,7 @@ static RemoveResult remove_hpte(PowerPCCPU *cpu,
> target_ulong ptex,
>  *vp = v;
>  *rp = r;
>  ppc_hash64_store_hpte(cpu, ptex, HPTE64_V_HPTE_DIRTY, 0);
> -rb = compute_tlbie_rb(v, r, ptex);
> -ppc_tlb_invalidate_one(env, rb);
> +ppc_hash64_tlb_flush_hpte(cpu, ptex, v, r);
>  return REMOVE_SUCCESS;
>  }
>  
> @@ -322,7 +285,7 @@ static target_ulong h_protect(PowerPCCPU *cpu,
> sPAPRMachineState *spapr,
>  target_ulong pte_index = args[1];
>  target_ulong avpn = args[2];
>  uint64_t token;
> -target_ulong v, r, rb;
> +target_ulong v, r;
>  
>  if (!valid_pte_index(env, pte_index)) {
>  return H_PARAMETER;
> @@ -343,10 +306,9 @@ static target_ulong h_protect(PowerPCCPU *cpu,
> sPAPRMachineState *spapr,
>  r |= (flags << 55) & HPTE64_R_PP0;
>  r |= (flags << 48) & HPTE64_R_KEY_HI;
>  r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
> -rb = compute_tlbie_rb(v, r, pte_index);
>  ppc_hash64_store_hpte(cpu, pte_index,
>    (v & ~HPTE64_V_VALID) |
> HPTE64_V_HPTE_DIRTY, 0);
> -ppc_tlb_invalidate_one(env, rb);
> +ppc_hash64_tlb_flush_hpte(cpu, pte_index, v, r);
>  /* Don't need a memory barrier, due to qemu's global lock */
>  ppc_hash64_store_hpte(cpu, pte_index, v | HPTE64_V_HPTE_DIRTY,
> r);
>  return H_SUCCESS;
> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> index ee1e8bf..3284776 100644
> --- a/target-ppc/mmu-hash64.c
> +++ b/target-ppc/mmu-hash64.c
> @@ -707,3 +707,15 @@ void ppc_hash64_store_hpte(PowerPCCPU *cpu,
>   env->htab_base + pte_index + HASH_PTE_SIZE_64 / 2,
> pte1);
>  }
>  }
> +
> +void ppc_hash64_tlb_flush_hpte(PowerPCCPU *cpu,
> +   target_ulong pte_index,
> +   target_ulong pte0, target_ulong pte1)
> +{
> +/*
> + * XXX: given the fact that there are too many 

[Qemu-devel] [PATCH v7 10/13] spapr: CPU hotplug support

2016-01-27 Thread Bharata B Rao
Support CPU hotplug via device-add command like this:

(qemu) device_add powerpc64-cpu-core,id=core2

In response to device_add, CPU core device will be created. CPU core
device creates and realizes CPU thread devices. If the machine type
supports CPU hotplug, boot-time CPUs are created as CPU core devices
otherwise they continue to be created as individual CPU devices.

Set up device tree entries for the hotplugged CPU core and use the
existing EPOW event infrastructure to send CPU hotplug notification to
the guest.

Signed-off-by: Bharata B Rao 
---
 hw/ppc/spapr.c  | 145 +---
 hw/ppc/spapr_events.c   |   3 +
 hw/ppc/spapr_rtas.c |  24 
 target-ppc/translate_init.c |   8 +++
 4 files changed, 173 insertions(+), 7 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index eeea411..6ef520d 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -63,6 +63,7 @@
 
 #include "hw/compat.h"
 #include "qemu-common.h"
+#include "hw/ppc/cpu-core.h"
 
 #include 
 
@@ -601,6 +602,18 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, 
int offset,
 size_t page_sizes_prop_size;
 uint32_t vcpus_per_socket = smp_threads * smp_cores;
 uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
+sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(qdev_get_machine());
+sPAPRDRConnector *drc;
+sPAPRDRConnectorClass *drck;
+int drc_index;
+
+if (smc->dr_cpu_enabled) {
+drc = spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, index);
+g_assert(drc);
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+drc_index = drck->get_index(drc);
+_FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
+}
 
 /* Note: we keep CI large pages off for now because a 64K capable guest
  * provisioned with large pages might otherwise try to map a qemu
@@ -1764,6 +1777,8 @@ static void ppc_spapr_init(MachineState *machine)
 char *filename;
 int smt = kvmppc_smt_threads();
 int smp_max_cores = max_cpus / smp_threads;
+int spapr_smp_cores = smp_cpus / smp_threads;
+Object *core;
 
 msi_supported = true;
 
@@ -1843,13 +1858,22 @@ static void ppc_spapr_init(MachineState *machine)
 if (machine->cpu_model == NULL) {
 machine->cpu_model = kvm_enabled() ? "host" : "POWER7";
 }
-for (i = 0; i < smp_cpus; i++) {
-cpu = cpu_ppc_init(machine->cpu_model);
-if (cpu == NULL) {
-error_report("Unable to find PowerPC CPU definition");
-exit(1);
+
+if (smc->dr_cpu_enabled) {
+for (i = 0; i < spapr_smp_cores; i++) {
+core = object_new(TYPE_POWERPC_CPU_CORE);
+object_property_set_bool(core, true, "realized", _abort);
+}
+} else {
+for (i = 0; i < smp_cpus; i++) {
+cpu = cpu_ppc_init(machine->cpu_model);
+if (cpu == NULL) {
+error_report("Unable to find PowerPC CPU definition");
+exit(1);
+}
+object_property_set_bool(OBJECT(cpu), true, "realized",
+ _abort);
 }
-spapr_cpu_init(spapr, cpu, _fatal);
 }
 
 if (kvm_enabled()) {
@@ -2245,10 +2269,92 @@ out:
 error_propagate(errp, local_err);
 }
 
+static void *spapr_populate_hotplug_cpu_dt(DeviceState *dev, CPUState *cs,
+   int *fdt_offset,
+   sPAPRMachineState *spapr)
+{
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+DeviceClass *dc = DEVICE_GET_CLASS(cs);
+int id = ppc_get_vcpu_dt_id(cpu);
+void *fdt;
+int offset, fdt_size;
+char *nodename;
+
+fdt = create_device_tree(_size);
+nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
+offset = fdt_add_subnode(fdt, 0, nodename);
+
+spapr_populate_cpu_dt(cs, fdt, offset, spapr);
+g_free(nodename);
+
+*fdt_offset = offset;
+return fdt;
+}
+
+static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
+Error **errp)
+{
+sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(qdev_get_machine());
+sPAPRMachineState *ms = SPAPR_MACHINE(qdev_get_machine());
+PowerPCCPUCore *core = POWERPC_CPU_CORE(OBJECT(dev));
+PowerPCCPU *cpu = core->thread0;
+CPUState *cs = CPU(cpu);
+int id = ppc_get_vcpu_dt_id(cpu);
+sPAPRDRConnector *drc =
+spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
+sPAPRDRConnectorClass *drck;
+Error *local_err = NULL;
+void *fdt = NULL;
+int fdt_offset = 0;
+
+if (!smc->dr_cpu_enabled) {
+/*
+ * This is a cold plugged CPU core but the machine doesn't support
+ * DR. So skip the hotplug path ensuring that the core is brought
+ * up online with out an associated DR connector.
+ */
+return;
+}
+
+g_assert(drc);
+

[Qemu-devel] [PATCH v7 05/13] cpu: Reclaim vCPU objects

2016-01-27 Thread Bharata B Rao
From: Gu Zheng 

KVM vCPUs cannot be removed safely without some protection, so instead of
closing the KVM vCPU fd we mark the vCPU as stopped and record it in a list
of parked vCPUs, so that the fd can be reused by a later CPU hot-add request
if possible. This is also the approach that the KVM developers suggested:
https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html

Signed-off-by: Chen Fan 
Signed-off-by: Gu Zheng 
Signed-off-by: Zhu Guihua 
Signed-off-by: Bharata B Rao 
   [- Explicit CPU_REMOVE() from qemu_kvm/tcg_destroy_vcpu()
  isn't needed as it is done from cpu_exec_exit()
- Use iothread mutex instead of global mutex during
  destroy
- Don't cleanup vCPU object from vCPU thread context
  but leave it to the callers (device_add/device_del)]
Reviewed-by: David Gibson 
---
 cpus.c   | 38 +++
 include/qom/cpu.h| 10 +
 include/sysemu/kvm.h |  1 +
 kvm-all.c| 57 +++-
 kvm-stub.c   |  5 +
 5 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/cpus.c b/cpus.c
index 1e97cc4..c5631f0 100644
--- a/cpus.c
+++ b/cpus.c
@@ -953,6 +953,18 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void 
*data), void *data)
 qemu_cpu_kick(cpu);
 }
 
+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+if (kvm_destroy_vcpu(cpu) < 0) {
+error_report("kvm_destroy_vcpu failed");
+exit(EXIT_FAILURE);
+}
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
 struct qemu_work_item *wi;
@@ -1053,6 +1065,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
 }
 }
 qemu_kvm_wait_io_event(cpu);
+if (cpu->exit && !cpu_can_run(cpu)) {
+qemu_kvm_destroy_vcpu(cpu);
+qemu_mutex_unlock_iothread();
+return NULL;
+}
 }
 
 return NULL;
@@ -1108,6 +1125,7 @@ static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
 CPUState *cpu = arg;
+CPUState *remove_cpu = NULL;
 
 rcu_register_thread();
 
@@ -1145,6 +1163,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
 }
 }
 qemu_tcg_wait_io_event(QTAILQ_FIRST());
+CPU_FOREACH(cpu) {
+if (cpu->exit && !cpu_can_run(cpu)) {
+remove_cpu = cpu;
+break;
+}
+}
+if (remove_cpu) {
+qemu_tcg_destroy_vcpu(remove_cpu);
+remove_cpu = NULL;
+}
 }
 
 return NULL;
@@ -1301,6 +1329,13 @@ void resume_all_vcpus(void)
 }
 }
 
+void cpu_remove(CPUState *cpu)
+{
+cpu->stop = true;
+cpu->exit = true;
+qemu_cpu_kick(cpu);
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16
 
@@ -1517,6 +1552,9 @@ static void tcg_exec_all(void)
 break;
 }
 } else if (cpu->stop || cpu->stopped) {
+if (cpu->exit) {
+next_cpu = CPU_NEXT(cpu);
+}
 break;
 }
 }
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 2e5229d..32a2c71 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -232,6 +232,7 @@ struct kvm_run;
  * @halted: Nonzero if the CPU is in suspended state.
  * @stop: Indicates a pending stop request.
  * @stopped: Indicates the CPU has been artificially stopped.
+ * @exit: Indicates the CPU has exited due to an unplug operation.
  * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU
  * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
  *   CPU and return to its top level loop.
@@ -284,6 +285,7 @@ struct CPUState {
 bool created;
 bool stop;
 bool stopped;
+bool exit;
 bool crash_occurred;
 bool exit_request;
 uint32_t interrupt_request;
@@ -751,6 +753,14 @@ void cpu_exit(CPUState *cpu);
 void cpu_resume(CPUState *cpu);
 
 /**
+ * cpu_remove:
+ * @cpu: The CPU to remove.
+ *
+ * Requests the CPU to be removed.
+ */
+void cpu_remove(CPUState *cpu);
+
+/**
  * qemu_init_vcpu:
  * @cpu: The vCPU to initialize.
  *
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 7741f91..7324fa9 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -218,6 +218,7 @@ int kvm_has_intx_set_mask(void);
 
 int kvm_init_vcpu(CPUState *cpu);
 int kvm_cpu_exec(CPUState *cpu);
+int kvm_destroy_vcpu(CPUState *cpu);
 
 #ifdef NEED_CPU_H
 
diff --git a/kvm-all.c b/kvm-all.c
index 9148889..699c1ce 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -62,6 +62,12 @@
 
 #define KVM_MSI_HASHTAB_SIZE256
 
+struct KVMParkedVcpu {
+unsigned long vcpu_id;
+int 

[Qemu-devel] [PATCH v7 13/13] hmp: Add "info ppc-cpu-cores" command

2016-01-27 Thread Bharata B Rao
This is the hmp equivalent of "query ppc-cpu-cores"

Signed-off-by: Bharata B Rao 
---
 hmp-commands-info.hx | 16 
 hmp.c| 31 +++
 hmp.h|  1 +
 3 files changed, 48 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 9b71351..cd9a42e 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -786,6 +786,22 @@ STEXI
 Display the value of a storage key (s390 only)
 ETEXI
 
+#if defined(TARGET_PPC64)
+{
+.name   = "ppc-cpu-cores",
+.args_type  = "",
+.params = "",
+.help   = "show PowerPC CPU core devices",
+.mhandler.cmd = hmp_info_ppc_cpu_cores,
+},
+#endif
+
+STEXI
+@item info ppc-cpu-cores
+@findex ppc-cpu-cores
+Show PowerPC CPU core devices.
+ETEXI
+
 STEXI
 @end table
 ETEXI
diff --git a/hmp.c b/hmp.c
index 54f2620..ae75aa1 100644
--- a/hmp.c
+++ b/hmp.c
@@ -2375,3 +2375,34 @@ void hmp_rocker_of_dpa_groups(Monitor *mon, const QDict 
*qdict)
 
 qapi_free_RockerOfDpaGroupList(list);
 }
+
+void hmp_info_ppc_cpu_cores(Monitor *mon, const QDict *qdict)
+{
+Error *err = NULL;
+PPCCPUCoreList *ppc_cpu_core_list = qmp_query_ppc_cpu_cores();
+PPCCPUCoreList *s = ppc_cpu_core_list;
+CpuInfoList *thread;
+
+while (s) {
+monitor_printf(mon, "PowerPC CPU device: \"%s\"\n",
+   s->value->id ? s->value->id : "");
+monitor_printf(mon, "  hotplugged: %s\n",
+   s->value->hotplugged ? "true" : "false");
+monitor_printf(mon, "  hotpluggable: %s\n",
+   s->value->hotpluggable ? "true" : "false");
+monitor_printf(mon, "  Threads:\n");
+for (thread = s->value->threads; thread; thread = thread->next) {
+monitor_printf(mon, "CPU #%" PRId64 ":", thread->value->CPU);
+monitor_printf(mon, " nip=0x%016" PRIx64,
+   thread->value->u.ppc->nip);
+if (thread->value->halted) {
+monitor_printf(mon, " (halted)");
+}
+monitor_printf(mon, " thread_id=%" PRId64 "\n",
+   thread->value->thread_id);
+}
+s = s->next;
+}
+
+qapi_free_PPCCPUCoreList(ppc_cpu_core_list);
+}
diff --git a/hmp.h b/hmp.h
index a8c5b5a..a31e3d2 100644
--- a/hmp.h
+++ b/hmp.h
@@ -131,5 +131,6 @@ void hmp_rocker(Monitor *mon, const QDict *qdict);
 void hmp_rocker_ports(Monitor *mon, const QDict *qdict);
 void hmp_rocker_of_dpa_flows(Monitor *mon, const QDict *qdict);
 void hmp_rocker_of_dpa_groups(Monitor *mon, const QDict *qdict);
+void hmp_info_ppc_cpu_cores(Monitor *mon, const QDict *qdict);
 
 #endif
-- 
2.1.0




[Qemu-devel] [PATCH v7 11/13] spapr: CPU hot unplug support

2016-01-27 Thread Bharata B Rao
Remove the CPU core device by removing the underlying CPU thread devices.
Hot removal of CPU for sPAPR guests is supported by sending the hot unplug
notification to the guest via EPOW interrupt. Release the vCPU object
after CPU hot unplug so that vCPU fd can be parked and reused.

Signed-off-by: Bharata B Rao 
---
 hw/ppc/spapr.c | 90 ++
 include/hw/ppc/spapr.h |  8 +
 2 files changed, 98 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 6ef520d..0a112d8 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2417,11 +2417,101 @@ static void spapr_machine_device_plug(HotplugHandler 
*hotplug_dev,
 }
 }
 
+static void spapr_cpu_destroy(PowerPCCPU *cpu)
+{
+sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+
+xics_cpu_destroy(spapr->icp, cpu);
+qemu_unregister_reset(spapr_cpu_reset, cpu);
+}
+
+static void spapr_cpu_core_cleanup(struct sPAPRCPUUnplugList *unplug_list)
+{
+sPAPRCPUUnplug *unplug, *next;
+Object *cpu;
+
+QLIST_FOREACH_SAFE(unplug, unplug_list, node, next) {
+cpu = unplug->cpu;
+object_unparent(cpu);
+QLIST_REMOVE(unplug, node);
+g_free(unplug);
+}
+}
+
+static void spapr_add_cpu_to_unplug_list(Object *cpu,
+ struct sPAPRCPUUnplugList 
*unplug_list)
+{
+sPAPRCPUUnplug *unplug = g_malloc(sizeof(*unplug));
+
+unplug->cpu = cpu;
+QLIST_INSERT_HEAD(unplug_list, unplug, node);
+}
+
+static int spapr_cpu_release(Object *obj, void *opaque)
+{
+DeviceState *dev = DEVICE(obj);
+CPUState *cs = CPU(dev);
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+struct sPAPRCPUUnplugList *unplug_list = opaque;
+
+spapr_cpu_destroy(cpu);
+cpu_remove_sync(cs);
+
+/*
+ * We are still walking the core object's children list, and
+ * hence can't cleanup this CPU thread object just yet. Put
+ * it on a list for later removal.
+ */
+spapr_add_cpu_to_unplug_list(obj, unplug_list);
+return 0;
+}
+
+static void spapr_core_release(DeviceState *dev, void *opaque)
+{
+struct sPAPRCPUUnplugList unplug_list;
+
+QLIST_INIT(_list);
+object_child_foreach(OBJECT(dev), spapr_cpu_release, _list);
+spapr_cpu_core_cleanup(_list);
+object_unparent(OBJECT(dev));
+}
+
+static void spapr_core_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
+ Error **errp)
+{
+PowerPCCPUCore *core = POWERPC_CPU_CORE(OBJECT(dev));
+PowerPCCPU *cpu = core->thread0;
+int id = ppc_get_vcpu_dt_id(cpu);
+sPAPRDRConnector *drc =
+spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
+sPAPRDRConnectorClass *drck;
+Error *local_err = NULL;
+
+g_assert(drc);
+
+drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+drck->detach(drc, dev, spapr_core_release, NULL, _err);
+if (local_err) {
+error_propagate(errp, local_err);
+return;
+}
+
+spapr_hotplug_req_remove_by_index(drc);
+}
+
 static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
   DeviceState *dev, Error **errp)
 {
+sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(qdev_get_machine());
+
 if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) {
 error_setg(errp, "Memory hot unplug not supported by sPAPR");
+} else if (object_dynamic_cast(OBJECT(dev), TYPE_POWERPC_CPU_CORE)) {
+if (!smc->dr_cpu_enabled) {
+error_setg(errp, "CPU hot unplug not supported on this machine");
+return;
+}
+spapr_core_unplug(hotplug_dev, dev, errp);
 }
 }
 
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index a9d98e7..e161f8f 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -630,4 +630,12 @@ int spapr_rng_populate_dt(void *fdt);
  */
 #define SPAPR_LMB_FLAGS_ASSIGNED 0x0008
 
+/* List to store unplugged CPU objects for cleanup during unplug */
+typedef struct sPAPRCPUUnplug {
+Object *cpu;
+QLIST_ENTRY(sPAPRCPUUnplug) node;
+} sPAPRCPUUnplug;
+
+QLIST_HEAD(sPAPRCPUUnplugList, sPAPRCPUUnplug);
+
 #endif /* !defined (__HW_SPAPR_H__) */
-- 
2.1.0




[Qemu-devel] [PATCH v6 0/5] add ACPI node for fw_cfg on pc and arm

2016-01-27 Thread Gabriel L. Somlo
New since v5:

- rebased on top of latest QEMU git master

Thanks,
  --Gabriel

>New since v4:
>
>   - rebased on top of Marc's DMA series
>   - drop machine compat dependency for insertion into x86/ssdt
> (patch 3/5), following agreement between Igor and Eduardo
>   - [mm]io register range now covers DMA register as well, if
> available.
>   - s/bios/firmware in doc file updates
>
>>New since v3:
>>
>>  - rebased to work on top of 87e896ab (introducing pc-*-25 classes),
>>inserting fw_cfg acpi node only for machines >= 2.5.
>>
>>  - reintroduce _STA with value 0x0B (bit 2 for u/i visibility turned
>>off to avoid Windows complaining -- thanks Igor for catching that!)
>>
>>If there's any other feedback besides questions regarding the
>>appropriateness of "QEMU0002" as the value of _HID, please don't hesitate!
>>
>>>New since v2:
>>>
>>> - pc/i386 node in ssdt only on machine types *newer* than 2.4
>>>   (as suggested by Eduardo)
>>>
>>>I appreciate any further comments and reviews. Hopefully we can make
>>>this palatable for upstream, modulo the lingering concerns about whether
>>>"QEMU0002" is ok to use as the value of _HID, which I'll hopefully get
>>>sorted out with the kernel crew...
>>>
New since v1:

- expose control register size (suggested by Marc Marí)

- leaving out _UID and _STA fields (thanks Shannon & Igor)

- using "QEMU0002" as the value of _HID (thanks Michael)

- added documentation blurb to docs/specs/fw_cfg.txt
  (mainly to record usage of the "QEMU0002" string with fw_cfg).

> This series adds a fw_cfg device node to the SSDT (on pc), or to the
> DSDT (on arm).
>
>   - Patch 1/3 moves (and renames) the BIOS_CFG_IOPORT (0x510)
> define from pc.c to pc.h, so that it could be used from
> acpi-build.c in patch 2/3.
> 
>   - Patch 2/3 adds a fw_cfg node to the pc SSDT.
> 
>   - Patch 3/3 adds a fw_cfg node to the arm DSDT.
>
> I made up some names - "FWCF" for the node name, and "FWCF0001"
> for _HID; no idea whether that's appropriate, or how else I should
> figure out what to use instead...
>
> Also, using scope "\\_SB", based on where fw_cfg shows up in the
> output of "info qtree". Again, if that's wrong, please point me in
> the right direction.
>
> Re. 3/3 (also mentioned after the commit blurb in the patch itself),
> I noticed none of the other DSDT entries contain a _STA field, wondering
> why it would (not) make sense to include that, same as on the PC.

Gabriel L. Somlo (5):
  fw_cfg: expose control register size in fw_cfg.h
  pc: fw_cfg: move ioport base constant to pc.h
  acpi: pc: add fw_cfg device node to ssdt
  acpi: arm: add fw_cfg device node to dsdt
  fw_cfg: document ACPI device node information

 docs/specs/fw_cfg.txt |  9 +
 hw/arm/virt-acpi-build.c  | 15 +++
 hw/i386/acpi-build.c  | 29 +
 hw/i386/pc.c  |  5 ++---
 hw/nvram/fw_cfg.c |  4 +++-
 include/hw/i386/pc.h  |  2 ++
 include/hw/nvram/fw_cfg.h |  3 +++
 7 files changed, 63 insertions(+), 4 deletions(-)

-- 
2.4.3




Re: [Qemu-devel] VFIO based vGPU(was Re: [Announcement] 2015-Q3 release of XenGT - a Mediated ...)

2016-01-27 Thread Kirti Wankhede



On 1/28/2016 3:28 AM, Alex Williamson wrote:

On Thu, 2016-01-28 at 02:25 +0530, Kirti Wankhede wrote:


On 1/27/2016 9:30 PM, Alex Williamson wrote:

On Wed, 2016-01-27 at 13:36 +0530, Kirti Wankhede wrote:


On 1/27/2016 1:36 AM, Alex Williamson wrote:

On Tue, 2016-01-26 at 02:20 -0800, Neo Jia wrote:

On Mon, Jan 25, 2016 at 09:45:14PM +, Tian, Kevin wrote:

From: Alex Williamson [mailto:alex.william...@redhat.com]


Hi Alex, Kevin and Jike,

(Seems I shouldn't use attachment, resend it again to the list, patches are
inline at the end)

Thanks for adding me to this technical discussion, a great opportunity
for us to design together which can bring both Intel and NVIDIA vGPU solution to
KVM platform.

Instead of directly jumping to the proposal that we have been working on
recently for NVIDIA vGPU on KVM, I think it is better for me to put out a couple of
quick comments / thoughts regarding the existing discussions on this thread as
fundamentally I think we are solving the same problem, DMA, interrupt and MMIO.

Then we can look at what we have, hopefully we can reach some consensus soon.


Yes, and since you're creating and destroying the vgpu here, this is
where I'd expect a struct device to be created and added to an IOMMU
group.  The lifecycle management should really include links between
the vGPU and physical GPU, which would be much, much easier to do with
struct devices created here rather than at the point where we start
doing vfio "stuff".


In fact, to keep vfio-vgpu more generic, vgpu device creation and management
can be centralized and done in vfio-vgpu. That also includes adding the device
to an IOMMU group and a VFIO group.

Is this really a good idea?  The concept of a vgpu is not unique to
vfio, we want vfio to be a driver for a vgpu, not an integral part of
the lifecycle of a vgpu.  That certainly doesn't exclude adding
infrastructure to make lifecycle management of a vgpu more consistent
between drivers, but it should be done independently of vfio.  I'll go
back to the SR-IOV model, vfio is often used with SR-IOV VFs, but vfio
does not create the VF, that's done in coordination with the PF making
use of some PCI infrastructure for consistency between drivers.

It seems like we need to take more advantage of the class and driver
core support to perhaps setup a vgpu bus and class with vfio-vgpu just
being a driver for those devices.


For device passthrough or SR-IOV model, PCI devices are created by PCI
bus driver and from the probe routine each device is added in vfio group.


An SR-IOV VF is created by the PF driver using standard interfaces
provided by the PCI core.  The IOMMU group for a VF is added by the
IOMMU driver when the device is created on the pci_bus_type.  The probe
routine of the vfio bus driver (vfio-pci) is what adds the device into
the vfio group.


For vgpu, there should be a common module, say a vgpu module, that creates
the vgpu device, adds it to an IOMMU group and then adds it to a vfio
group.  This module can handle management of vgpus. The advantage of keeping
this as a separate module, rather than doing device creation in vendor
modules, is to have a generic interface for vgpu management, for example the
files /sys/class/vgpu/vgpu_start and /sys/class/vgpu/vgpu_shutdown and the
vgpu driver registration interface.


But you're suggesting something very different from the SR-IOV model.
If we wanted to mimic that model, the GPU specific driver should create
the vgpu using services provided by a common interface.  For instance
i915 could call a new vgpu_device_create() which creates the device,
adds it to the vgpu class, etc.  That vgpu device should not be assumed
to be used with vfio though, that should happen via a separate probe
using a vfio-vgpu driver.  It's that vfio bus driver that will add the
device to a vfio group.



In that case vgpu driver should provide a driver registration interface
to register vfio-vgpu driver.

struct vgpu_driver {
const char *name;
int (*probe) (struct vgpu_device *vdev);
void (*remove) (struct vgpu_device *vdev);
}

int vgpu_register_driver(struct vgpu_driver *driver)
{
...
}
EXPORT_SYMBOL(vgpu_register_driver);

int vgpu_unregister_driver(struct vgpu_driver *driver)
{
...
}
EXPORT_SYMBOL(vgpu_unregister_driver);

vfio-vgpu driver registers to vgpu driver. Then from
vgpu_device_create(), after creating the device it calls
vgpu_driver->probe(vgpu_device) and vfio-vgpu driver adds the device to
vfio group.

+--+vgpu_register_driver()+---+

 __init() +->+   |
  |  |   |
  +<-+vgpu.ko|
vfio_vgpu.ko |   probe()/remove()   |   |
  |+-+   +-+

+--+| +---+---+ |
  | ^ |
  

[Qemu-devel] [PATCH v7 00/13] sPAPR CPU hotplug

2016-01-27 Thread Bharata B Rao
Hi,

This is the 7th iteration of patchset that introduces CPU hotplug for
PowerPC sPAPR guests using device_add/device_del commands.

(qemu) device_add powerpc64-cpu-core,id=core1

The main change in this version is about adding "info ppc-cpu-cores"
QMP/HMP support to obtain information about PowerPC CPU cores.

The first 6 patches are generic changes.

1/13  machine: Don't allow CPU toplogies with partially filled cores
2/13  exec: Remove cpu from cpus list during cpu_exec_exit()
3/13  exec: Do vmstate unregistration from cpu_exec_exit()
4/13  cpu: Don't realize CPU from cpu_generic_init()

Above 4 patches can stand on their own and probably can be pushed
ahead of actual hotplug patches when found ready. Let me know if I
should pursue these in a separate patchset.

Out of the above 4, last three (2/13, 3/13, 4/13) are required by
s390 and have been posted in their CPU hotplug patchset.

5/13  cpu: Reclaim vCPU objects

Above patch is needed by x86 as well as s390 and has been posted in their
respective CPU hotplug patchsets.

6/13  cpu: Add a sync version of cpu_remove()

Above patch is needed by s390 and has been posted in their CPU hotplug
patchset.

The remaining patches are ppc/spapr specific. This patchset applies
on top of ppc-for-2.6 branch of David Gibson's tree.

Changes in v7
-
- Added two patches (12/13, 13/13) to obtain information about
  ppc cpu cores via QMP/HMP.
- Don't populate MachineClass::validate_smp_config() for TYPE_MACHINE
  so that all archs are not affected and don't have to explicitly
  disable the enforcement of partially filled cores requirement
  as per Eduardo Habkost's suggestion. (01/13)
- Some minor code cleanups in 10/13 as per David Gibson's suggestion.
- Store the first thread representing the core in PowerPCCPUCore so
  that the DRC index of the core can be easily found during DRC
  attach (08/13, 11/13).
- Make unplug_list local to core release routine as per David (11/13).

v6: http://lists.gnu.org/archive/html/qemu-ppc/2016-01/msg00060.html

Bharata B Rao (12):
  machine: Don't allow CPU toplogies with partially filled cores
  exec: Remove cpu from cpus list during cpu_exec_exit()
  exec: Do vmstate unregistration from cpu_exec_exit()
  cpu: Don't realize CPU from cpu_generic_init()
  cpu: Add a sync version of cpu_remove()
  xics,xics_kvm: Handle CPU unplug correctly
  target-ppc: Introduce PowerPC specific CPU core device
  spapr: Enable CPU hotplug for pseries-2.6 and add CPU DRC DT entries
  spapr: CPU hotplug support
  spapr: CPU hot unplug support
  qmp: Add query-ppc-cpu-cores command
  hmp: Add "info ppc-cpu-cores" command

Gu Zheng (1):
  cpu: Reclaim vCPU objects

 cpus.c  |  50 
 exec.c  |  30 +
 hmp-commands-info.hx|  16 +++
 hmp.c   |  31 +
 hmp.h   |   1 +
 hw/core/machine.c   |  23 
 hw/i386/pc.c|   1 +
 hw/i386/pc_piix.c   |   1 +
 hw/i386/pc_q35.c|   1 +
 hw/intc/xics.c  |  14 +++
 hw/intc/xics_kvm.c  |   8 +-
 hw/ppc/Makefile.objs|   1 +
 hw/ppc/cpu-core.c   | 152 +++
 hw/ppc/spapr.c  | 260 ++--
 hw/ppc/spapr_events.c   |   3 +
 hw/ppc/spapr_rtas.c |  24 
 include/hw/boards.h |   4 +
 include/hw/ppc/cpu-core.h   |  33 +
 include/hw/ppc/spapr.h  |   9 ++
 include/hw/ppc/xics.h   |   1 +
 include/qom/cpu.h   |  18 +++
 include/sysemu/kvm.h|   1 +
 kvm-all.c   |  57 -
 kvm-stub.c  |   5 +
 qapi-schema.json|  31 +
 qmp-commands.hx |  51 
 qom/cpu.c   |   6 -
 stubs/Makefile.objs |   1 +
 stubs/qmp_query_ppc_cpu_cores.c |  10 ++
 target-arm/helper.c |  16 ++-
 target-cris/cpu.c   |  16 ++-
 target-lm32/helper.c|  16 ++-
 target-moxie/cpu.c  |  16 ++-
 target-openrisc/cpu.c   |  16 ++-
 target-ppc/translate_init.c |  24 +++-
 target-sh4/cpu.c|  16 ++-
 target-tricore/helper.c |  16 ++-
 target-unicore32/helper.c   |  16 ++-
 vl.c|   5 +
 39 files changed, 973 insertions(+), 27 deletions(-)
 create mode 100644 hw/ppc/cpu-core.c
 create mode 100644 include/hw/ppc/cpu-core.h
 create mode 100644 stubs/qmp_query_ppc_cpu_cores.c

-- 
2.1.0




[Qemu-devel] [PATCH v7 06/13] cpu: Add a sync version of cpu_remove()

2016-01-27 Thread Bharata B Rao
This sync API will be used by the CPU hotplug code to wait for the CPU to
be completely removed before reporting the failure to the device_add
command.

Sync version of this call is needed to correctly recover from CPU
realization failures when ->plug() handler fails.

Signed-off-by: Bharata B Rao 
Reviewed-by: David Gibson 
---
 cpus.c| 12 
 include/qom/cpu.h |  8 
 2 files changed, 20 insertions(+)

diff --git a/cpus.c b/cpus.c
index c5631f0..2608ef5 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1067,6 +1067,8 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
 qemu_kvm_wait_io_event(cpu);
 if (cpu->exit && !cpu_can_run(cpu)) {
 qemu_kvm_destroy_vcpu(cpu);
+cpu->created = false;
+qemu_cond_signal(_cpu_cond);
 qemu_mutex_unlock_iothread();
 return NULL;
 }
@@ -1171,6 +1173,8 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
 }
 if (remove_cpu) {
 qemu_tcg_destroy_vcpu(remove_cpu);
+cpu->created = false;
+qemu_cond_signal(_cpu_cond);
 remove_cpu = NULL;
 }
 }
@@ -1336,6 +1340,14 @@ void cpu_remove(CPUState *cpu)
 qemu_cpu_kick(cpu);
 }
 
+void cpu_remove_sync(CPUState *cpu)
+{
+cpu_remove(cpu);
+while (cpu->created) {
+qemu_cond_wait(_cpu_cond, _global_mutex);
+}
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16
 
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 32a2c71..bed8654 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -760,6 +760,14 @@ void cpu_resume(CPUState *cpu);
  */
 void cpu_remove(CPUState *cpu);
 
+ /**
+ * cpu_remove_sync:
+ * @cpu: The CPU to remove.
+ *
+ * Requests the CPU to be removed and waits till it is removed.
+ */
+void cpu_remove_sync(CPUState *cpu);
+
 /**
  * qemu_init_vcpu:
  * @cpu: The vCPU to initialize.
-- 
2.1.0




[Qemu-devel] [PATCH v7 03/13] exec: Do vmstate unregistration from cpu_exec_exit()

2016-01-27 Thread Bharata B Rao
cpu_exec_init() does vmstate_register and register_savevm for the CPU device.
These need to be undone from cpu_exec_exit(). These changes are needed to
support CPU hot removal and also to correctly fail hotplug attempts
beyond max_cpus.

Signed-off-by: Bharata B Rao 
Reviewed-by: David Gibson 
---
 exec.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/exec.c b/exec.c
index c8da9d4..aa41032 100644
--- a/exec.c
+++ b/exec.c
@@ -591,6 +591,8 @@ static int cpu_get_free_index(Error **errp)
 
 void cpu_exec_exit(CPUState *cpu)
 {
+CPUClass *cc = CPU_GET_CLASS(cpu);
+
 if (cpu->cpu_index == -1) {
 /* cpu_index was never allocated by this @cpu or was already freed. */
 return;
@@ -599,6 +601,15 @@ void cpu_exec_exit(CPUState *cpu)
 QTAILQ_REMOVE(, cpu, node);
 bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
 cpu->cpu_index = -1;
+if (cc->vmsd != NULL) {
+vmstate_unregister(NULL, cc->vmsd, cpu);
+}
+#if defined(CPU_SAVE_VERSION)
+unregister_savevm(NULL, "cpu", cpu->env_ptr);
+#endif
+if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+vmstate_unregister(NULL, _cpu_common, cpu);
+}
 }
 #else
 
@@ -615,6 +626,8 @@ static int cpu_get_free_index(Error **errp)
 
 void cpu_exec_exit(CPUState *cpu)
 {
+CPUClass *cc = CPU_GET_CLASS(cpu);
+
 cpu_list_lock();
 if (cpu->cpu_index == -1) {
 cpu_list_unlock();
@@ -624,6 +637,13 @@ void cpu_exec_exit(CPUState *cpu)
 QTAILQ_REMOVE(, cpu, node);
 cpu->cpu_index = -1;
 cpu_list_unlock();
+
+if (cc->vmsd != NULL) {
+vmstate_unregister(NULL, cc->vmsd, cpu);
+}
+if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
+vmstate_unregister(NULL, _cpu_common, cpu);
+}
 }
 #endif
 
-- 
2.1.0




[Qemu-devel] [PATCH v7 02/13] exec: Remove cpu from cpus list during cpu_exec_exit()

2016-01-27 Thread Bharata B Rao
CPUState *cpu gets added to the cpus list during cpu_exec_init(). It
should be removed from cpu_exec_exit().

cpu_exec_exit() is called from generic CPU::instance_finalize and some
archs like PowerPC call it from CPU unrealizefn. So ensure that we
dequeue the cpu only once.

A cpu->cpu_index value of -1 now indicates that the cpu has already been
dequeued, in the CONFIG_USER_ONLY case as well.

Signed-off-by: Bharata B Rao 
Reviewed-by: David Gibson 
---
 exec.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/exec.c b/exec.c
index 7115403..c8da9d4 100644
--- a/exec.c
+++ b/exec.c
@@ -596,6 +596,7 @@ void cpu_exec_exit(CPUState *cpu)
 return;
 }
 
+QTAILQ_REMOVE(, cpu, node);
 bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
 cpu->cpu_index = -1;
 }
@@ -614,6 +615,15 @@ static int cpu_get_free_index(Error **errp)
 
 void cpu_exec_exit(CPUState *cpu)
 {
+cpu_list_lock();
+if (cpu->cpu_index == -1) {
+cpu_list_unlock();
+return;
+}
+
+QTAILQ_REMOVE(, cpu, node);
+cpu->cpu_index = -1;
+cpu_list_unlock();
 }
 #endif
 
-- 
2.1.0




[Qemu-devel] [PATCH v7 12/13] qmp: Add query-ppc-cpu-cores command

2016-01-27 Thread Bharata B Rao
Show the details of PPC CPU cores via a new QMP command.

TODO: update qmp-commands.hx with example

Signed-off-by: Bharata B Rao 
---
 hw/ppc/cpu-core.c   | 77 +
 qapi-schema.json| 31 +
 qmp-commands.hx | 51 +++
 stubs/Makefile.objs |  1 +
 stubs/qmp_query_ppc_cpu_cores.c | 10 ++
 5 files changed, 170 insertions(+)
 create mode 100644 stubs/qmp_query_ppc_cpu_cores.c

diff --git a/hw/ppc/cpu-core.c b/hw/ppc/cpu-core.c
index aa96e79..652a5aa 100644
--- a/hw/ppc/cpu-core.c
+++ b/hw/ppc/cpu-core.c
@@ -9,7 +9,84 @@
 #include "hw/ppc/cpu-core.h"
 #include "hw/boards.h"
 #include 
+#include 
 #include "qemu/error-report.h"
+#include "qmp-commands.h"
+
+/*
+ * QMP: info ppc-cpu-cores
+ */
+static int qmp_ppc_cpu_list(Object *obj, void *opaque)
+{
+CpuInfoList ***prev = opaque;
+
+if (object_dynamic_cast(obj, TYPE_POWERPC_CPU)) {
+CpuInfoList *elem = g_new0(CpuInfoList, 1);
+CpuInfo *s = g_new0(CpuInfo, 1);
+CPUState *cs = CPU(obj);
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+CPUPPCState *env = >env;
+
+cpu_synchronize_state(cs);
+s->arch = CPU_INFO_ARCH_PPC;
+s->current = (cs == first_cpu);
+s->CPU = cs->cpu_index;
+s->qom_path = object_get_canonical_path(obj);
+s->halted = cs->halted;
+s->thread_id = cs->thread_id;
+s->u.ppc = g_new0(CpuInfoPPC, 1);
+s->u.ppc->nip = env->nip;
+
+elem->value = s;
+elem->next = NULL;
+**prev = elem;
+*prev = >next;
+}
+object_child_foreach(obj, qmp_ppc_cpu_list, opaque);
+return 0;
+}
+
+static int qmp_ppc_cpu_core_list(Object *obj, void *opaque)
+{
+PPCCPUCoreList ***prev = opaque;
+
+if (object_dynamic_cast(obj, TYPE_POWERPC_CPU_CORE)) {
+DeviceClass *dc = DEVICE_GET_CLASS(obj);
+DeviceState *dev = DEVICE(obj);
+
+if (dev->realized) {
+PPCCPUCoreList *elem = g_new0(PPCCPUCoreList, 1);
+PPCCPUCore *s = g_new0(PPCCPUCore, 1);
+CpuInfoList *cpu_head = NULL;
+CpuInfoList **cpu_prev = _head;
+
+if (dev->id) {
+s->has_id = true;
+s->id = g_strdup(dev->id);
+}
+s->hotplugged = dev->hotplugged;
+s->hotpluggable = dc->hotpluggable;
+qmp_ppc_cpu_list(obj, _prev);
+s->threads = cpu_head;
+elem->value = s;
+elem->next = NULL;
+**prev = elem;
+*prev = >next;
+}
+}
+
+object_child_foreach(obj, qmp_ppc_cpu_core_list, opaque);
+return 0;
+}
+
+PPCCPUCoreList *qmp_query_ppc_cpu_cores(Error **errp)
+{
+PPCCPUCoreList *head = NULL;
+PPCCPUCoreList **prev = 
+
+qmp_ppc_cpu_core_list(qdev_get_machine(), );
+return head;
+}
 
 static int ppc_cpu_core_realize_child(Object *child, void *opaque)
 {
diff --git a/qapi-schema.json b/qapi-schema.json
index 8d04897..0902697 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4083,3 +4083,34 @@
 ##
 { 'enum': 'ReplayMode',
   'data': [ 'none', 'record', 'play' ] }
+
+##
+# @PPCCPUCore:
+#
+# Information about PPC CPU core devices
+#
+# @hotplugged: true if device was hotplugged
+#
+# @hotpluggable: true if device could be added/removed while machine is 
running
+#
+# Since: 2.6
+##
+
+{ 'struct': 'PPCCPUCore',
+  'data': { '*id': 'str',
+'hotplugged': 'bool',
+'hotpluggable': 'bool',
+'threads' : ['CpuInfo']
+  }
+}
+
+##
+# @query-ppc-cpu-cores:
+#
+# Returns information for all PPC CPU core devices
+#
+# Returns: a list of @PPCCPUCore.
+#
+# Since: 2.6
+##
+{ 'command': 'query-ppc-cpu-cores', 'returns': ['PPCCPUCore'] }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index db072a6..77cda3c 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -4795,3 +4795,54 @@ Example:
  {"type": 0, "out-pport": 0, "pport": 0, "vlan-id": 3840,
   "pop-vlan": 1, "id": 251658240}
]}
+
+EQMP
+
+#if defined TARGET_PPC64
+{
+.name   = "query-ppc-cpu-cores",
+.args_type  = "",
+.mhandler.cmd_new = qmp_marshal_query_ppc_cpu_cores,
+},
+#endif
+
+SQMP
+@query-ppc-cpu-cores
+
+
+Show PowerPC CPU core devices information.
+
+Example:
+-> { "execute": "query-ppc-cpu-cores" }
+<- {"return": [{"threads": [
+ {"arch": "ppc",
+  "current": false,
+  "CPU": 16,
+  "nip": -4611686018426944644,
+  "qom_path": "/machine/peripheral/core2/thread[0]",
+  "halted": false,
+  "thread_id": 32636},
+ {"arch": "ppc",
+  "current": false,
+  "CPU": 17,
+  "nip": -4611686018426944644,
+  

[Qemu-devel] [PATCH v7 08/13] target-ppc: Introduce PowerPC specific CPU core device

2016-01-27 Thread Bharata B Rao
CPU core device is a container of CPU thread devices.  CPU hotplug is
performed at the granularity of CPU core device. When hotplugged, CPU core
creates CPU thread devices.

Signed-off-by: Bharata B Rao 
---
 hw/ppc/Makefile.objs  |  1 +
 hw/ppc/cpu-core.c | 75 +++
 include/hw/ppc/cpu-core.h | 33 +
 3 files changed, 109 insertions(+)
 create mode 100644 hw/ppc/cpu-core.c
 create mode 100644 include/hw/ppc/cpu-core.h

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index c1ffc77..a6b7cfb 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -21,3 +21,4 @@ obj-$(CONFIG_E500) += e500.o mpc8544ds.o e500plat.o
 obj-$(CONFIG_E500) += mpc8544_guts.o ppce500_spin.o
 # PowerPC 440 Xilinx ML507 reference board.
 obj-$(CONFIG_XILINX) += virtex_ml507.o
+obj-y += cpu-core.o
diff --git a/hw/ppc/cpu-core.c b/hw/ppc/cpu-core.c
new file mode 100644
index 000..aa96e79
--- /dev/null
+++ b/hw/ppc/cpu-core.c
@@ -0,0 +1,75 @@
+/*
+ * PowerPC CPU core device, acts as container of CPU thread devices.
+ *
+ * Copyright (C) 2016 Bharata B Rao 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "hw/ppc/cpu-core.h"
+#include "hw/boards.h"
+#include 
+#include "qemu/error-report.h"
+
+static int ppc_cpu_core_realize_child(Object *child, void *opaque)
+{
+Error **errp = opaque;
+
+object_property_set_bool(child, true, "realized", errp);
+if (*errp) {
+return 1;
+}
+
+return 0;
+}
+
+static void ppc_cpu_core_realize(DeviceState *dev, Error **errp)
+{
+object_child_foreach(OBJECT(dev), ppc_cpu_core_realize_child, errp);
+}
+
+static void ppc_cpu_core_class_init(ObjectClass *oc, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(oc);
+
+dc->realize = ppc_cpu_core_realize;
+dc->desc = "PowerPC CPU core";
+}
+
+static void ppc_cpu_core_instance_init(Object *obj)
+{
+int i;
+CPUState *cpu;
+MachineState *machine = MACHINE(qdev_get_machine());
+PowerPCCPUCore *core = POWERPC_CPU_CORE(obj);
+
+/* Create as many CPU threads as specified in the topology */
+for (i = 0; i < smp_threads; i++) {
+cpu = cpu_generic_init(TYPE_POWERPC_CPU, machine->cpu_model);
+if (!cpu) {
+error_report("Unable to find CPU definition: %s",
+  machine->cpu_model);
+exit(EXIT_FAILURE);
+}
+object_property_add_child(obj, "thread[*]", OBJECT(cpu), _abort);
+object_unref(OBJECT(cpu));
+if (!i) {
+core->thread0 = POWERPC_CPU(cpu);
+}
+}
+}
+
+static const TypeInfo ppc_cpu_core_type_info = {
+.name = TYPE_POWERPC_CPU_CORE,
+.parent = TYPE_DEVICE,
+.class_init = ppc_cpu_core_class_init,
+.instance_init = ppc_cpu_core_instance_init,
+.instance_size = sizeof(PowerPCCPUCore),
+};
+
+static void cpu_core_register_types(void)
+{
+type_register_static(_cpu_core_type_info);
+}
+
+type_init(cpu_core_register_types)
diff --git a/include/hw/ppc/cpu-core.h b/include/hw/ppc/cpu-core.h
new file mode 100644
index 000..ff2ebc2
--- /dev/null
+++ b/include/hw/ppc/cpu-core.h
@@ -0,0 +1,33 @@
+/*
+ * CPU core device.
+ *
+ * Copyright (C) 2016 Bharata B Rao 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef HW_PPC_CPU_CORE_H
+#define HW_PPC_CPU_CORE_H
+
+#include "hw/qdev.h"
+
+#ifdef TARGET_PPC64
+#define TYPE_POWERPC_CPU_CORE "powerpc64-cpu-core"
+#elif defined(TARGET_PPCEMB)
+#define TYPE_POWERPC_CPU_CORE "embedded-powerpc-cpu-core"
+#else
+#define TYPE_POWERPC_CPU_CORE "powerpc-cpu-core"
+#endif
+
+#define POWERPC_CPU_CORE(obj) \
+OBJECT_CHECK(PowerPCCPUCore, (obj), TYPE_POWERPC_CPU_CORE)
+
+typedef struct PowerPCCPUCore {
+/*< private >*/
+DeviceState parent_obj;
+/*< public >*/
+
+PowerPCCPU *thread0;
+} PowerPCCPUCore;
+
+#endif
-- 
2.1.0




Re: [Qemu-devel] [PATCH v14 3/8] Backup: clear all bitmap when doing block checkpoint

2016-01-27 Thread Changlong Xie

On 01/28/2016 12:05 AM, Stefan Hajnoczi wrote:

On Wed, Jan 13, 2016 at 05:18:27PM +0800, Changlong Xie wrote:

diff --git a/blockjob.c b/blockjob.c
index 80adb9d..0c8edfe 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -533,3 +533,14 @@ void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job)
  QLIST_INSERT_HEAD(>jobs, job, txn_list);
  block_job_txn_ref(txn);
  }
+
+void block_job_do_checkpoint(BlockJob *job, Error **errp)
+{
+if (!job->driver->do_checkpoint) {
+error_setg(errp, "The job %s doesn't support block checkpoint",
+   BlockJobType_lookup[job->driver->job_type]);
+return;
+}
+
+job->driver->do_checkpoint(job, errp);
+}
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index d84ccd8..abdba7c 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -70,6 +70,9 @@ typedef struct BlockJobDriver {
   * never both.
   */
  void (*abort)(BlockJob *job);
+
+/** Optional callback for job types that support checkpoint. */
+void (*do_checkpoint)(BlockJob *job, Error **errp);


The COLO/replication-specific callbacks have been moved out of
BlockDriver into their own replication struct.  Similar reasoning
applies to BlockJobDriver:

The do_checkpoint() callback is only implemented by one type of job and
its purpose is related to COLO rather than jobs.  This is a strong
indication that this shouldn't be part of the generic BlockJobDriver
struct.

Please drop changes to the generic blockjob interface.  Instead, make
backup_do_checkpoint() public and add assert(job->driver->type ==
BLOCK_JOB_TYPE_BACKUP) into the function.

Then the replication filter can call backup_do_checkpoint() directly.



Will fix it in next version.

Thanks
-Xie


Stefan







Re: [Qemu-devel] [PATCH v14 4/8] Allow creating backup jobs when opening BDS

2016-01-27 Thread Changlong Xie

On 01/27/2016 10:04 PM, Stefan Hajnoczi wrote:

On Wed, Jan 13, 2016 at 05:18:28PM +0800, Changlong Xie wrote:

From: Wen Congyang 

When opening BDS, we need to create backup jobs for
image-fleecing.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Jeff Cody 
---
  block/Makefile.objs | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 58ef2ef..fa05f37 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -22,10 +22,10 @@ block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
  block-obj-$(CONFIG_LIBSSH2) += ssh.o
  block-obj-y += accounting.o
  block-obj-y += write-threshold.o
+block-obj-y += backup.o

  common-obj-y += stream.o
  common-obj-y += commit.o
-common-obj-y += backup.o

  iscsi.o-cflags := $(LIBISCSI_CFLAGS)
  iscsi.o-libs   := $(LIBISCSI_LIBS)


The commit message and description seem outdated.  I guess the purpose
of this patch is to link the backup block job into all programs that use
the block layer because you want to add a dependency on it from core
code.



Will update it in next version.

Thanks
-Xie





[Qemu-devel] [PATCH v7 09/13] spapr: Enable CPU hotplug for pseries-2.6 and add CPU DRC DT entries

2016-01-27 Thread Bharata B Rao
Start supporting CPU hotplug from pseries-2.6 onwards. Add CPU
DRC (Dynamic Resource Connector) device tree entries.

Signed-off-by: Bharata B Rao 
Reviewed-by: David Gibson 
---
 hw/ppc/spapr.c | 23 +++
 include/hw/ppc/spapr.h |  1 +
 2 files changed, 24 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 6ac9f06..eeea411 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -985,6 +985,16 @@ static void spapr_finalize_fdt(sPAPRMachineState *spapr,
 _FDT(spapr_drc_populate_dt(fdt, 0, NULL, SPAPR_DR_CONNECTOR_TYPE_LMB));
 }
 
+if (smc->dr_cpu_enabled) {
+int offset = fdt_path_offset(fdt, "/cpus");
+ret = spapr_drc_populate_dt(fdt, offset, NULL,
+SPAPR_DR_CONNECTOR_TYPE_CPU);
+if (ret < 0) {
+fprintf(stderr, "Couldn't set up CPU DR device tree properties\n");
+exit(1);
+}
+}
+
 _FDT((fdt_pack(fdt)));
 
 if (fdt_totalsize(fdt) > FDT_MAX_SIZE) {
@@ -1752,6 +1762,8 @@ static void ppc_spapr_init(MachineState *machine)
 long load_limit, fw_size;
 bool kernel_le = false;
 char *filename;
+int smt = kvmppc_smt_threads();
+int smp_max_cores = max_cpus / smp_threads;
 
 msi_supported = true;
 
@@ -1818,6 +1830,15 @@ static void ppc_spapr_init(MachineState *machine)
 spapr_validate_node_memory(machine, _fatal);
 }
 
+if (smc->dr_cpu_enabled) {
+for (i = 0; i < smp_max_cores; i++) {
+sPAPRDRConnector *drc =
+spapr_dr_connector_new(OBJECT(spapr),
+   SPAPR_DR_CONNECTOR_TYPE_CPU, i * smt);
+qemu_register_reset(spapr_drc_reset, drc);
+}
+}
+
 /* init CPUs */
 if (machine->cpu_model == NULL) {
 machine->cpu_model = kvm_enabled() ? "host" : "POWER7";
@@ -2323,6 +2344,7 @@ static void spapr_machine_class_init(ObjectClass *oc, 
void *data)
 mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id;
 
 smc->dr_lmb_enabled = true;
+smc->dr_cpu_enabled = true;
 fwc->get_dev_path = spapr_get_fw_dev_path;
 nc->nmi_monitor_handler = spapr_nmi;
 }
@@ -2402,6 +2424,7 @@ static void spapr_machine_2_5_class_options(MachineClass 
*mc)
 
 spapr_machine_2_6_class_options(mc);
 smc->use_ohci_by_default = true;
+smc->dr_cpu_enabled = false;
 SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_5);
 mc->validate_smp_config = NULL;
 }
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 1f9e722..a9d98e7 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -36,6 +36,7 @@ struct sPAPRMachineClass {
 
 /*< public >*/
 bool dr_lmb_enabled;   /* enable dynamic-reconfig/hotplug of LMBs */
+bool dr_cpu_enabled;   /* enable dynamic-reconfig/hotplug of CPUs */
 bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
 };
 
-- 
2.1.0




[Qemu-devel] [PATCH v7 04/13] cpu: Don't realize CPU from cpu_generic_init()

2016-01-27 Thread Bharata B Rao
Don't do CPU realization from cpu_generic_init(). With this
cpu_generic_init() will be used to just create CPU threads and they
should be realized separately from realizefn call.

Convert the existing callers to do explicit realization.

Signed-off-by: Bharata B Rao 
Reviewed-by: David Gibson 
Reviewed-by: Eduardo Habkost 
---
 qom/cpu.c   |  6 --
 target-arm/helper.c | 16 +++-
 target-cris/cpu.c   | 16 +++-
 target-lm32/helper.c| 16 +++-
 target-moxie/cpu.c  | 16 +++-
 target-openrisc/cpu.c   | 16 +++-
 target-ppc/translate_init.c | 16 +++-
 target-sh4/cpu.c| 16 +++-
 target-tricore/helper.c | 16 +++-
 target-unicore32/helper.c   | 16 +++-
 10 files changed, 135 insertions(+), 15 deletions(-)

diff --git a/qom/cpu.c b/qom/cpu.c
index 8f537a4..01fd776 100644
--- a/qom/cpu.c
+++ b/qom/cpu.c
@@ -63,13 +63,7 @@ CPUState *cpu_generic_init(const char *typename, const char 
*cpu_model)
 featurestr = strtok(NULL, ",");
 cc->parse_features(cpu, featurestr, );
 g_free(str);
-if (err != NULL) {
-goto out;
-}
-
-object_property_set_bool(OBJECT(cpu), true, "realized", );
 
-out:
 if (err != NULL) {
 error_report_err(err);
 object_unref(OBJECT(cpu));
diff --git a/target-arm/helper.c b/target-arm/helper.c
index ae02486..4a46cdb 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -4564,7 +4564,21 @@ void register_cp_regs_for_features(ARMCPU *cpu)
 
 ARMCPU *cpu_arm_init(const char *cpu_model)
 {
-return ARM_CPU(cpu_generic_init(TYPE_ARM_CPU, cpu_model));
+CPUState *cpu = cpu_generic_init(TYPE_ARM_CPU, cpu_model);
+Error *err = NULL;
+
+if (!cpu) {
+return NULL;
+}
+
+object_property_set_bool(OBJECT(cpu), true, "realized", );
+if (err != NULL) {
+error_report_err(err);
+object_unref(OBJECT(cpu));
+return NULL;
+} else {
+return ARM_CPU(cpu);
+}
 }
 
 void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
diff --git a/target-cris/cpu.c b/target-cris/cpu.c
index 8eaf5a5..d2c0822 100644
--- a/target-cris/cpu.c
+++ b/target-cris/cpu.c
@@ -89,7 +89,21 @@ static ObjectClass *cris_cpu_class_by_name(const char 
*cpu_model)
 
 CRISCPU *cpu_cris_init(const char *cpu_model)
 {
-return CRIS_CPU(cpu_generic_init(TYPE_CRIS_CPU, cpu_model));
+CPUState *cpu = cpu_generic_init(TYPE_CRIS_CPU, cpu_model);
+Error *err = NULL;
+
+if (!cpu) {
+return NULL;
+}
+
+object_property_set_bool(OBJECT(cpu), true, "realized", );
+if (err != NULL) {
+error_report_err(err);
+object_unref(OBJECT(cpu));
+return NULL;
+} else {
+return CRIS_CPU(cpu);
+}
 }
 
 /* Sort alphabetically by VR. */
diff --git a/target-lm32/helper.c b/target-lm32/helper.c
index e26c133..49ac960 100644
--- a/target-lm32/helper.c
+++ b/target-lm32/helper.c
@@ -218,7 +218,21 @@ bool lm32_cpu_exec_interrupt(CPUState *cs, int 
interrupt_request)
 
 LM32CPU *cpu_lm32_init(const char *cpu_model)
 {
-return LM32_CPU(cpu_generic_init(TYPE_LM32_CPU, cpu_model));
+CPUState *cpu = cpu_generic_init(TYPE_LM32_CPU, cpu_model);
+Error *err = NULL;
+
+if (!cpu) {
+return NULL;
+}
+
+object_property_set_bool(OBJECT(cpu), true, "realized", );
+if (err != NULL) {
+error_report_err(err);
+object_unref(OBJECT(cpu));
+return NULL;
+} else {
+return LM32_CPU(cpu);
+}
 }
 
 /* Some soc ignores the MSB on the address bus. Thus creating a shadow memory
diff --git a/target-moxie/cpu.c b/target-moxie/cpu.c
index 0c60c65..5989fa6 100644
--- a/target-moxie/cpu.c
+++ b/target-moxie/cpu.c
@@ -152,7 +152,21 @@ static const MoxieCPUInfo moxie_cpus[] = {
 
 MoxieCPU *cpu_moxie_init(const char *cpu_model)
 {
-return MOXIE_CPU(cpu_generic_init(TYPE_MOXIE_CPU, cpu_model));
+CPUState *cpu = cpu_generic_init(TYPE_MOXIE_CPU, cpu_model);
+Error *err = NULL;
+
+if (!cpu) {
+return NULL;
+}
+
+object_property_set_bool(OBJECT(cpu), true, "realized", );
+if (err != NULL) {
+error_report_err(err);
+object_unref(OBJECT(cpu));
+return NULL;
+} else {
+return MOXIE_CPU(cpu);
+}
 }
 
 static void cpu_register(const MoxieCPUInfo *info)
diff --git a/target-openrisc/cpu.c b/target-openrisc/cpu.c
index cc5e2d1..873eafb 100644
--- a/target-openrisc/cpu.c
+++ b/target-openrisc/cpu.c
@@ -222,7 +222,21 @@ static void openrisc_cpu_register_types(void)
 
 OpenRISCCPU *cpu_openrisc_init(const char *cpu_model)
 {
-return OPENRISC_CPU(cpu_generic_init(TYPE_OPENRISC_CPU, cpu_model));
+CPUState *cpu = cpu_generic_init(TYPE_OPENRISC_CPU, cpu_model);
+Error *err = NULL;
+
+if (!cpu) {
+return NULL;
+}
+
+   

[Qemu-devel] [PATCH v7 07/13] xics, xics_kvm: Handle CPU unplug correctly

2016-01-27 Thread Bharata B Rao
XICS is set up for each CPU during initialization. Provide a routine
to undo the same when CPU is unplugged. While here, move ss->cs management
into xics from xics_kvm since there is nothing KVM specific in it.
Also ensure xics reset doesn't set irq for CPUs that are already unplugged.

This allows reboot of a VM that has undergone CPU hotplug and unplug
to work correctly.

Signed-off-by: Bharata B Rao 
Reviewed-by: David Gibson 
---
 hw/intc/xics.c| 14 ++
 hw/intc/xics_kvm.c|  8 
 include/hw/ppc/xics.h |  1 +
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/hw/intc/xics.c b/hw/intc/xics.c
index 9ff5796..0782e86 100644
--- a/hw/intc/xics.c
+++ b/hw/intc/xics.c
@@ -44,6 +44,18 @@ static int get_cpu_index_by_dt_id(int cpu_dt_id)
 return -1;
 }
 
+void xics_cpu_destroy(XICSState *icp, PowerPCCPU *cpu)
+{
+CPUState *cs = CPU(cpu);
+ICPState *ss = >ss[cs->cpu_index];
+
+assert(cs->cpu_index < icp->nr_servers);
+assert(cs == ss->cs);
+
+ss->output = NULL;
+ss->cs = NULL;
+}
+
 void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
 {
 CPUState *cs = CPU(cpu);
@@ -53,6 +65,8 @@ void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu)
 
 assert(cs->cpu_index < icp->nr_servers);
 
+ss->cs = cs;
+
 if (info->cpu_setup) {
 info->cpu_setup(icp, cpu);
 }
diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
index d58729c..049d4e2 100644
--- a/hw/intc/xics_kvm.c
+++ b/hw/intc/xics_kvm.c
@@ -109,8 +109,10 @@ static void icp_kvm_reset(DeviceState *dev)
 icp->pending_priority = 0xff;
 icp->mfrr = 0xff;
 
-/* Make all outputs are deasserted */
-qemu_set_irq(icp->output, 0);
+/* Make all outputs as deasserted only if the CPU thread is in use */
+if (icp->output) {
+qemu_set_irq(icp->output, 0);
+}
 
 icp_set_kvm_state(icp, 1);
 }
@@ -343,8 +345,6 @@ static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU 
*cpu)
 if (icpkvm->kernel_xics_fd != -1) {
 int ret;
 
-ss->cs = cs;
-
 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_IRQ_XICS, 0,
   icpkvm->kernel_xics_fd, 
kvm_arch_vcpu_id(cs));
 if (ret < 0) {
diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
index 355a966..640162f 100644
--- a/include/hw/ppc/xics.h
+++ b/include/hw/ppc/xics.h
@@ -166,5 +166,6 @@ int xics_alloc_block(XICSState *icp, int src, int num, bool 
lsi, bool align);
 void xics_free(XICSState *icp, int irq, int num);
 
 void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu);
+void xics_cpu_destroy(XICSState *icp, PowerPCCPU *cpu);
 
 #endif /* __XICS_H__ */
-- 
2.1.0




[Qemu-devel] [PATCH v7 01/13] machine: Don't allow CPU topologies with partially filled cores

2016-01-27 Thread Bharata B Rao
Prevent guests from booting with CPU topologies that have partially
filled CPU cores or can result in partially filled CPU cores after
CPU hotplug like

-smp 15,sockets=1,cores=4,threads=4,maxcpus=16 or
-smp 15,sockets=1,cores=4,threads=4,maxcpus=17.

This is enforced by introducing MachineClass::validate_smp_config()
that gets called from generic SMP parsing code. Machine type versions
that want to enforce this can define this to the generic version
provided.

Only sPAPR and PC machine types starting from version 2.6 enforce this in
this patch.

Signed-off-by: Bharata B Rao 
---
 hw/core/machine.c   | 23 +++
 hw/i386/pc.c|  1 +
 hw/i386/pc_piix.c   |  1 +
 hw/i386/pc_q35.c|  1 +
 hw/ppc/spapr.c  |  2 ++
 include/hw/boards.h |  4 
 vl.c|  5 +
 7 files changed, 37 insertions(+)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index c46ddc7..4505995 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -336,6 +336,29 @@ static void machine_init_notify(Notifier *notifier, void 
*data)
 foreach_dynamic_sysbus_device(error_on_sysbus_device, NULL);
 }
 
+/*
+ * Machine types that want to prevent starting of guests with
+ * partially filled CPU cores can use this routine as their
+ * MachineClass:validate_smp_config().
+ */
+void validate_smp_config_generic(int smp_cpus, int max_cpus,
+ int smp_threads, Error **errp)
+{
+if (smp_cpus % smp_threads) {
+error_setg(errp, "cpu topology: "
+   "smp_cpus (%u) should be multiple of threads (%u) ",
+   smp_cpus, smp_threads);
+return;
+}
+
+if (max_cpus % smp_threads) {
+error_setg(errp, "cpu topology: "
+   "max_cpus (%u) should be multiple of threads (%u) ",
+   max_cpus, smp_threads);
+return;
+}
+}
+
 static void machine_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 78cf8fa..a54e0a0 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1971,6 +1971,7 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 mc->hot_add_cpu = pc_hot_add_cpu;
 mc->max_cpus = 255;
 mc->reset = pc_machine_reset;
+mc->validate_smp_config = validate_smp_config_generic;
 hc->plug = pc_machine_device_plug_cb;
 hc->unplug_request = pc_machine_device_unplug_request_cb;
 hc->unplug = pc_machine_device_unplug_cb;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index bc74557..98b8b69 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -435,6 +435,7 @@ static void pc_i440fx_2_5_machine_options(MachineClass *m)
 pc_i440fx_2_6_machine_options(m);
 m->alias = NULL;
 m->is_default = 0;
+m->validate_smp_config = NULL;
 pcmc->save_tsc_khz = false;
 SET_MACHINE_COMPAT(m, PC_COMPAT_2_5);
 }
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 6128b02..c5f4935 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -362,6 +362,7 @@ static void pc_q35_2_5_machine_options(MachineClass *m)
 PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
 pc_q35_2_6_machine_options(m);
 m->alias = NULL;
+m->validate_smp_config = NULL;
 pcmc->save_tsc_khz = false;
 SET_MACHINE_COMPAT(m, PC_COMPAT_2_5);
 }
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index a9c9a95..6ac9f06 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2317,6 +2317,7 @@ static void spapr_machine_class_init(ObjectClass *oc, 
void *data)
 mc->has_dynamic_sysbus = true;
 mc->pci_allow_0_address = true;
 mc->get_hotplug_handler = spapr_get_hotpug_handler;
+mc->validate_smp_config = validate_smp_config_generic;
 hc->plug = spapr_machine_device_plug;
 hc->unplug = spapr_machine_device_unplug;
 mc->cpu_index_to_socket_id = spapr_cpu_index_to_socket_id;
@@ -2402,6 +2403,7 @@ static void spapr_machine_2_5_class_options(MachineClass 
*mc)
 spapr_machine_2_6_class_options(mc);
 smc->use_ohci_by_default = true;
 SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_5);
+mc->validate_smp_config = NULL;
 }
 
 DEFINE_SPAPR_MACHINE(2_5, "2.5", false);
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 0f30959..435c339 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -40,6 +40,8 @@ int machine_kvm_shadow_mem(MachineState *machine);
 int machine_phandle_start(MachineState *machine);
 bool machine_dump_guest_core(MachineState *machine);
 bool machine_mem_merge(MachineState *machine);
+void validate_smp_config_generic(int smp_cpus, int max_cpus,
+ int smp_threads, Error **errp);
 
 /**
  * MachineClass:
@@ -99,6 +101,8 @@ struct MachineClass {
 HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
DeviceState *dev);
 unsigned (*cpu_index_to_socket_id)(unsigned cpu_index);
+void (*validate_smp_config)(int smp_cpus, int 

Re: [Qemu-devel] VFIO based vGPU(was Re: [Announcement] 2015-Q3 release of XenGT - a Mediated ...)

2016-01-27 Thread Jike Song
On 01/28/2016 12:19 AM, Alex Williamson wrote:
> On Wed, 2016-01-27 at 13:43 +0800, Jike Song wrote:
{snip}

>> Had a look at eventfd, I would say yes, technically we are able to
>> achieve the goal: introduce a fd, with fop->{read|write} defined in KVM,
>> call into vgpu device-model, also an iodev registered for a MMIO GPA
>> range to invoke the fop->{read|write}.  I just didn't understand why
>> userspace can't register an iodev via API directly.
> 
> Please elaborate on how it would work via iodev.
>

QEMU forwards the BAR0 write to the bus driver; if the bus driver
finds that the MEM bit is enabled, it registers an iodev with KVM,
with these ops:

const struct kvm_io_device_ops trap_mmio_ops = {
.read   = kvmgt_guest_mmio_read,
.write  = kvmgt_guest_mmio_write,
};

I may not be able to illustrate it clearly with descriptions, but this
should not be a problem: thanks to your explanation, I can understand
and adopt it for KVMGT.


>> Besides, this doesn't necessarily require another thread, right?
>> I guess it can be within the VCPU thread? 
> 
> I would think so too, the vcpu is blocked on the MMIO access, we should
> be able to service it in that context.  I hope.
> 

Thanks for confirmation.

>> And this brought another question: except for the vfio bus driver and
>> iommu backend (and the page_track utility used for guest memory 
>> write-protection), 
>> is KVMGT allowed to call into kvm.ko (or modify it)? Though we are
>> becoming less and less willing to do that with VFIO, it's still better
>> to know that before going wrong.
> 
> kvm and vfio are separate modules, for the most part, they know nothing
> about each other and have no hard dependencies between them.  We do have
> various accelerations we can use to avoid paths through userspace, but
> these are all via APIs that are agnostic of the party on the other end.
> For example, vfio signals interrupts through eventfds and has no concept
> of whether that eventfd terminates in userspace or into an irqfd in KVM.
> vfio supports direct access to device MMIO regions via mmaps, but vfio
> has no idea if that mmap gets directly mapped into a VM address space.
> Even with posted interrupts, we've introduced an irq bypass manager
> allowing interrupt producers and consumers to register independently to
> form a connection without directly knowing anything about the other
> module.  That sort of proper software layering needs to continue.  It
> would be wrong for a vfio bus driver to assume KVM is the user and
> directly call into KVM interfaces.  Thanks,
> 

I understand and agree with your point: it's bad if the bus driver
assumes KVM is the user and/or calls into KVM interfaces.

However, the vgpu device-model, which in the Intel case is also part of
the i915 driver, will always need to call some hypervisor-specific interfaces.
For example, when a guest gfx driver submits GPU commands, the device-model
may want to scan them for security or other purposes:

- get a GPA (from GPU page tables)
- want to read 16 bytes from that GPA
- call hypervisor-specific read_gpa() method
- for Xen, the GPA belongs to a foreign domain, it must find
  a way to map & read it - beyond our scope here;
- for KVM, the GPA can be converted to an HVA, then read with
  copy_from_user (if called from the vcpu thread) or access_remote_vm
  (if called from other threads);

Please note that this is not from the vfio bus driver, but from the vgpu
device-model; also, this is not a DMA address from the GPU tables, but a real GPA.


> Alex
> 

--
Thanks,
Jike




[Qemu-devel] [PATCH v6 1/5] fw_cfg: expose control register size in fw_cfg.h

2016-01-27 Thread Gabriel L. Somlo
Expose the size of the control register (FW_CFG_CTL_SIZE) in fw_cfg.h.
Add comment to fw_cfg_io_realize() pointing out that since the
8-bit data register is always subsumed by the 16-bit control
register in the port I/O case, we use the control register width
as the *total* width of the (classic, non-DMA) port I/O region reserved
for the device.

Cc: Marc Marí 
Signed-off-by: Gabriel Somlo 
Reviewed-by: Laszlo Ersek 
Reviewed-by: Marc Marí 
---
 hw/nvram/fw_cfg.c | 4 +++-
 include/hw/nvram/fw_cfg.h | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index a1d650d..06a4ff0 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -31,7 +31,6 @@
 #include "qemu/error-report.h"
 #include "qemu/config-file.h"
 
-#define FW_CFG_CTL_SIZE 2
 #define FW_CFG_NAME "fw_cfg"
 #define FW_CFG_PATH "/machine/" FW_CFG_NAME
 
@@ -881,6 +880,9 @@ static void fw_cfg_io_realize(DeviceState *dev, Error 
**errp)
 FWCfgIoState *s = FW_CFG_IO(dev);
 SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 
+/* when using port i/o, the 8-bit data register ALWAYS overlaps
+ * with half of the 16-bit control register. Hence, the total size
+ * of the i/o region used is FW_CFG_CTL_SIZE */
 memory_region_init_io(>comb_iomem, OBJECT(s), _cfg_comb_mem_ops,
   FW_CFG(s), "fwcfg", FW_CFG_CTL_SIZE);
 sysbus_add_io(sbd, s->iobase, >comb_iomem);
diff --git a/include/hw/nvram/fw_cfg.h b/include/hw/nvram/fw_cfg.h
index 664eaf6..2667ca9 100644
--- a/include/hw/nvram/fw_cfg.h
+++ b/include/hw/nvram/fw_cfg.h
@@ -46,6 +46,9 @@
 
 #define FW_CFG_INVALID  0x
 
+/* width in bytes of fw_cfg control register */
+#define FW_CFG_CTL_SIZE 0x02
+
 #define FW_CFG_MAX_FILE_PATH56
 
 #ifndef NO_QEMU_PROTOS
-- 
2.4.3




[Qemu-devel] [PATCH v6 0/5] add ACPI node for fw_cfg on pc and arm

2016-01-27 Thread Gabriel L. Somlo
New since v5:

- rebased on top of latest QEMU git master

Thanks,
  --Gabriel

>New since v4:
>
>   - rebased on top of Marc's DMA series
>   - drop machine compat dependency for insertion into x86/ssdt
> (patch 3/5), following agreement between Igor and Eduardo
>   - [mm]io register range now covers DMA register as well, if
> available.
>   - s/bios/firmware in doc file updates
>
>>New since v3:
>>
>>  - rebased to work on top of 87e896ab (introducing pc-*-25 classes),
>>inserting fw_cfg acpi node only for machines >= 2.5.
>>
>>  - reintroduce _STA with value 0x0B (bit 2 for u/i visibility turned
>>off to avoid Windows complaining -- thanks Igor for catching that!)
>>
>>If there's any other feedback besides questions regarding the
>>appropriateness of "QEMU0002" as the value of _HID, please don't hesitate!
>>
>>>New since v2:
>>>
>>> - pc/i386 node in ssdt only on machine types *newer* than 2.4
>>>   (as suggested by Eduardo)
>>>
>>>I appreciate any further comments and reviews. Hopefully we can make
>>>this palatable for upstream, modulo the lingering concerns about whether
>>>"QEMU0002" is ok to use as the value of _HID, which I'll hopefully get
>>>sorted out with the kernel crew...
>>>
New since v1:

- expose control register size (suggested by Marc Marí)

- leaving out _UID and _STA fields (thanks Shannon & Igor)

- using "QEMU0002" as the value of _HID (thanks Michael)

- added documentation blurb to docs/specs/fw_cfg.txt
  (mainly to record usage of the "QEMU0002" string with fw_cfg).

> This series adds a fw_cfg device node to the SSDT (on pc), or to the
> DSDT (on arm).
>
>   - Patch 1/3 moves (and renames) the BIOS_CFG_IOPORT (0x510)
> define from pc.c to pc.h, so that it could be used from
> acpi-build.c in patch 2/3.
> 
>   - Patch 2/3 adds a fw_cfg node to the pc SSDT.
> 
>   - Patch 3/3 adds a fw_cfg node to the arm DSDT.
>
> I made up some names - "FWCF" for the node name, and "FWCF0001"
> for _HID; no idea whether that's appropriate, or how else I should
> figure out what to use instead...
>
> Also, using scope "\\_SB", based on where fw_cfg shows up in the
> output of "info qtree". Again, if that's wrong, please point me in
> the right direction.
>
> Re. 3/3 (also mentioned after the commit blurb in the patch itself),
> I noticed none of the other DSDT entries contain a _STA field, wondering
> why it would (not) make sense to include that, same as on the PC.

Gabriel L. Somlo (5):
  fw_cfg: expose control register size in fw_cfg.h
  pc: fw_cfg: move ioport base constant to pc.h
  acpi: pc: add fw_cfg device node to ssdt
  acpi: arm: add fw_cfg device node to dsdt
  fw_cfg: document ACPI device node information

 docs/specs/fw_cfg.txt |  9 +
 hw/arm/virt-acpi-build.c  | 15 +++
 hw/i386/acpi-build.c  | 29 +
 hw/i386/pc.c  |  5 ++---
 hw/nvram/fw_cfg.c |  4 +++-
 include/hw/i386/pc.h  |  2 ++
 include/hw/nvram/fw_cfg.h |  3 +++
 7 files changed, 63 insertions(+), 4 deletions(-)

-- 
2.4.3




[Qemu-devel] [PATCH v6 3/5] acpi: pc: add fw_cfg device node to ssdt

2016-01-27 Thread Gabriel L. Somlo
Add a fw_cfg device node to the ACPI SSDT. While the guest-side
firmware can't utilize this information (since it has to access
the hard-coded fw_cfg device to extract ACPI tables to begin with),
having fw_cfg listed in ACPI will help the guest kernel keep a more
accurate inventory of in-use IO port regions.

Signed-off-by: Gabriel Somlo 
Reviewed-by: Laszlo Ersek 
Reviewed-by: Marc Marí 
---
 hw/i386/acpi-build.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 78758e2..8a9ae9d 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2100,6 +2100,35 @@ build_ssdt(GArray *table_data, GArray *linker,
 aml_append(scope, aml_name_decl("_S5", pkg));
 aml_append(ssdt, scope);
 
+/* create fw_cfg node, unconditionally */
+{
+/* when using port i/o, the 8-bit data register *always* overlaps
+ * with half of the 16-bit control register. Hence, the total size
+ * of the i/o region used is FW_CFG_CTL_SIZE; when using DMA, the
+ * DMA control register is located at FW_CFG_DMA_IO_BASE + 4 */
+uint8_t io_size = object_property_get_bool(OBJECT(guest_info->fw_cfg),
+   "dma_enabled", NULL) ?
+  ROUND_UP(FW_CFG_CTL_SIZE, 4) + sizeof(dma_addr_t) :
+  FW_CFG_CTL_SIZE;
+
+scope = aml_scope("\\_SB");
+dev = aml_device("FWCF");
+
+aml_append(dev, aml_name_decl("_HID", aml_string("QEMU0002")));
+
+/* device present, functioning, decoding, not shown in UI */
+aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
+
+crs = aml_resource_template();
+aml_append(crs,
+aml_io(AML_DECODE16, FW_CFG_IO_BASE, FW_CFG_IO_BASE, 0x01, io_size)
+);
+aml_append(dev, aml_name_decl("_CRS", crs));
+
+aml_append(scope, dev);
+aml_append(ssdt, scope);
+}
+
 if (misc->applesmc_io_base) {
 scope = aml_scope("\\_SB.PCI0.ISA");
 dev = aml_device("SMC");
-- 
2.4.3




[Qemu-devel] [PATCH v6 2/5] pc: fw_cfg: move ioport base constant to pc.h

2016-01-27 Thread Gabriel L. Somlo
Move BIOS_CFG_IOPORT define from pc.c to pc.h, and rename
it to FW_CFG_IO_BASE.

Cc: Marc Marí 
Signed-off-by: Gabriel Somlo 
Reviewed-by: Laszlo Ersek 
Reviewed-by: Marc Marí 
---
 hw/i386/pc.c | 5 ++---
 include/hw/i386/pc.h | 2 ++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 78cf8fa..aa79dd1 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -77,7 +77,6 @@
 #define DPRINTF(fmt, ...)
 #endif
 
-#define BIOS_CFG_IOPORT 0x510
 #define FW_CFG_ACPI_TABLES (FW_CFG_ARCH_LOCAL + 0)
 #define FW_CFG_SMBIOS_ENTRIES (FW_CFG_ARCH_LOCAL + 1)
 #define FW_CFG_IRQ0_OVERRIDE (FW_CFG_ARCH_LOCAL + 2)
@@ -755,7 +754,7 @@ static FWCfgState *bochs_bios_init(AddressSpace *as)
 int i, j;
 unsigned int apic_id_limit = pc_apic_id_limit(max_cpus);
 
-fw_cfg = fw_cfg_init_io_dma(BIOS_CFG_IOPORT, BIOS_CFG_IOPORT + 4, as);
+fw_cfg = fw_cfg_init_io_dma(FW_CFG_IO_BASE, FW_CFG_IO_BASE + 4, as);
 
 /* FW_CFG_MAX_CPUS is a bit confusing/problematic on x86:
  *
@@ -1269,7 +1268,7 @@ FWCfgState *xen_load_linux(PCMachineState *pcms,
 
 assert(MACHINE(pcms)->kernel_filename != NULL);
 
-fw_cfg = fw_cfg_init_io(BIOS_CFG_IOPORT);
+fw_cfg = fw_cfg_init_io(FW_CFG_IO_BASE);
 rom_set_fw(fw_cfg);
 
 load_linux(pcms, fw_cfg);
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 65e8f24..0a4e0da 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -272,6 +272,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name);
 
 ISADevice *pc_find_fdc0(void);
 
+#define FW_CFG_IO_BASE 0x510
+
 /* acpi_piix.c */
 
 I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base,
-- 
2.4.3




Re: [Qemu-devel] [PATCH v8 01/16] block: Release dirty bitmaps in bdrv_close()

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> bdrv_delete() is not very happy about deleting BlockDriverStates with
> dirty bitmaps still attached to them. In the past, we got around that
> very easily by relying on bdrv_close_all() bypassing bdrv_delete(), and
> bdrv_close() simply ignoring that condition. We should fix that by
> releasing all dirty bitmaps in bdrv_close() and drop the assertion in
> bdrv_delete().
> 
> Signed-off-by: Max Reitz 
> Reviewed-by: John Snow 
> ---
>  block.c | 37 +
>  1 file changed, 29 insertions(+), 8 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 5709d3d..9a31e20 100644
> --- a/block.c
> +++ b/block.c
> @@ -88,6 +88,8 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const 
> char *filename,
>   const BdrvChildRole *child_role, Error **errp);
>  
>  static void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
> +static void bdrv_release_all_dirty_bitmaps(BlockDriverState *bs);
> +
>  /* If non-zero, use only whitelisted block drivers */
>  static int use_bdrv_whitelist;
>  
> @@ -2157,6 +2159,8 @@ void bdrv_close(BlockDriverState *bs)
>  
>  notifier_list_notify(>close_notifiers, bs);
>  
> +bdrv_release_all_dirty_bitmaps(bs);
> +
>  if (bs->blk) {
>  blk_dev_change_media_cb(bs->blk, false);
>  }
> @@ -2366,7 +2370,6 @@ static void bdrv_delete(BlockDriverState *bs)
>  assert(!bs->job);
>  assert(bdrv_op_blocker_is_empty(bs));
>  assert(!bs->refcnt);
> -assert(QLIST_EMPTY(>dirty_bitmaps));
>  
>  bdrv_close(bs);
>  
> @@ -3582,21 +3585,39 @@ static void 
> bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
>  }
>  }
>  
> -void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
> +static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
> +  BdrvDirtyBitmap *bitmap)
>  {
>  BdrvDirtyBitmap *bm, *next;
>  QLIST_FOREACH_SAFE(bm, >dirty_bitmaps, list, next) {
> -if (bm == bitmap) {
> +if (!bitmap || bm == bitmap) {
>  assert(!bdrv_dirty_bitmap_frozen(bm));
> -QLIST_REMOVE(bitmap, list);
> -hbitmap_free(bitmap->bitmap);
> -g_free(bitmap->name);
> -g_free(bitmap);
> -return;
> +QLIST_REMOVE(bm, list);
> +hbitmap_free(bm->bitmap);
> +g_free(bm->name);
> +g_free(bm);
> +
> +if (bitmap) {
> +return;
> +}
>  }
>  }
>  }
>  
> +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
> +{
> +bdrv_do_release_matching_dirty_bitmap(bs, bitmap);
> +}
> +
> +/**
> + * Release all dirty bitmaps attached to a BDS (for use in bdrv_close()). 
> There
> + * must not be any frozen bitmaps attached.

Should we assert that? And IIUC the intention of this function is to release
all monitor owned (i.e. user created) dirty bitmaps, which must be named. If
so, can we assert that too?

Fam

> + */
> +static void bdrv_release_all_dirty_bitmaps(BlockDriverState *bs)
> +{
> +bdrv_do_release_matching_dirty_bitmap(bs, NULL);
> +}
> +
>  void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap)
>  {
>  assert(!bdrv_dirty_bitmap_frozen(bitmap));
> -- 
> 2.7.0
> 



Re: [Qemu-devel] [PATCH v8 05/16] virtio-scsi: Catch BDS-BB removal/insertion

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> Make use of the BDS-BB removal and insertion notifiers to remove or set
> up, respectively, virtio-scsi's op blockers.
> 
> Signed-off-by: Max Reitz 
> ---
>  hw/scsi/virtio-scsi.c   | 55 
> +
>  include/hw/virtio/virtio-scsi.h | 10 
>  2 files changed, 65 insertions(+)
> 
> diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> index 607593c..b508b81 100644
> --- a/hw/scsi/virtio-scsi.c
> +++ b/hw/scsi/virtio-scsi.c
> @@ -757,6 +757,22 @@ static void virtio_scsi_change(SCSIBus *bus, SCSIDevice 
> *dev, SCSISense sense)
>  }
>  }
>  
> +static void virtio_scsi_blk_insert_notifier(Notifier *n, void *data)
> +{
> +VirtIOSCSIBlkChangeNotifier *cn = DO_UPCAST(VirtIOSCSIBlkChangeNotifier,
> +n, n);
> +assert(cn->sd->conf.blk == data);
> +blk_op_block_all(cn->sd->conf.blk, cn->s->blocker);
> +}
> +
> +static void virtio_scsi_blk_remove_notifier(Notifier *n, void *data)
> +{
> +VirtIOSCSIBlkChangeNotifier *cn = DO_UPCAST(VirtIOSCSIBlkChangeNotifier,
> +n, n);
> +assert(cn->sd->conf.blk == data);
> +blk_op_unblock_all(cn->sd->conf.blk, cn->s->blocker);
> +}
> +
>  static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState 
> *dev,
>  Error **errp)
>  {
> @@ -765,6 +781,22 @@ static void virtio_scsi_hotplug(HotplugHandler 
> *hotplug_dev, DeviceState *dev,
>  SCSIDevice *sd = SCSI_DEVICE(dev);
>  
>  if (s->ctx && !s->dataplane_disabled) {
> +VirtIOSCSIBlkChangeNotifier *insert_notifier, *remove_notifier;
> +
> +insert_notifier = g_new0(VirtIOSCSIBlkChangeNotifier, 1);
> +insert_notifier->n.notify = virtio_scsi_blk_insert_notifier;
> +insert_notifier->s = s;
> +insert_notifier->sd = sd;
> +blk_add_insert_bs_notifier(sd->conf.blk, _notifier->n);
> +QTAILQ_INSERT_TAIL(>insert_notifiers, insert_notifier, next);
> +
> +remove_notifier = g_new0(VirtIOSCSIBlkChangeNotifier, 1);
> +remove_notifier->n.notify = virtio_scsi_blk_remove_notifier;
> +remove_notifier->s = s;
> +remove_notifier->sd = sd;
> +blk_add_remove_bs_notifier(sd->conf.blk, _notifier->n);
> +QTAILQ_INSERT_TAIL(>remove_notifiers, remove_notifier, next);
> +
>  if (blk_op_is_blocked(sd->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
>  return;
>  }
> @@ -787,6 +819,7 @@ static void virtio_scsi_hotunplug(HotplugHandler 
> *hotplug_dev, DeviceState *dev,
>  VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
>  VirtIOSCSI *s = VIRTIO_SCSI(vdev);
>  SCSIDevice *sd = SCSI_DEVICE(dev);
> +VirtIOSCSIBlkChangeNotifier *insert_notifier, *remove_notifier;
>  
>  if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
>  virtio_scsi_push_event(s, sd,
> @@ -797,6 +830,25 @@ static void virtio_scsi_hotunplug(HotplugHandler 
> *hotplug_dev, DeviceState *dev,
>  if (s->ctx) {
>  blk_op_unblock_all(sd->conf.blk, s->blocker);
>  }
> +
> +QTAILQ_FOREACH(insert_notifier, >insert_notifiers, next) {
> +if (insert_notifier->sd == sd) {
> +notifier_remove(_notifier->n);
> +QTAILQ_REMOVE(>insert_notifiers, insert_notifier, next);
> +g_free(insert_notifier);
> +break;
> +}
> +}
> +
> +QTAILQ_FOREACH(remove_notifier, >remove_notifiers, next) {
> +if (remove_notifier->sd == sd) {
> +notifier_remove(_notifier->n);
> +QTAILQ_REMOVE(>remove_notifiers, remove_notifier, next);
> +g_free(remove_notifier);
> +break;
> +}
> +}
> +
>  qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
>  }
>  
> @@ -911,6 +963,9 @@ static void virtio_scsi_device_realize(DeviceState *dev, 
> Error **errp)
>  add_migration_state_change_notifier(>migration_state_notifier);
>  
>  error_setg(>blocker, "block device is in use by data plane");
> +
> +QTAILQ_INIT(>insert_notifiers);
> +QTAILQ_INIT(>remove_notifiers);
>  }
>  
>  static void virtio_scsi_instance_init(Object *obj)
> diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
> index 088fe9f..0394eb2 100644
> --- a/include/hw/virtio/virtio-scsi.h
> +++ b/include/hw/virtio/virtio-scsi.h
> @@ -76,6 +76,13 @@ typedef struct VirtIOSCSICommon {
>  VirtQueue **cmd_vqs;
>  } VirtIOSCSICommon;
>  
> +typedef struct VirtIOSCSIBlkChangeNotifier {
> +Notifier n;
> +struct VirtIOSCSI *s;
> +SCSIDevice *sd;
> +QTAILQ_ENTRY(VirtIOSCSIBlkChangeNotifier) next;
> +} VirtIOSCSIBlkChangeNotifier;
> +
>  typedef struct VirtIOSCSI {
>  VirtIOSCSICommon parent_obj;
>  
> @@ -86,6 +93,9 @@ typedef struct VirtIOSCSI {
>  /* Fields for dataplane below */
>  AioContext *ctx; /* one iothread per 

Re: [Qemu-devel] [PATCH v8 08/16] block: Use blk_remove_bs() in blk_delete()

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> Signed-off-by: Max Reitz 
> Reviewed-by: Kevin Wolf 
> ---
>  block/block-backend.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 621787c..7f5ad59 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -166,10 +166,7 @@ static void blk_delete(BlockBackend *blk)
>  assert(!blk->refcnt);
>  assert(!blk->dev);
>  if (blk->bs) {
> -assert(blk->bs->blk == blk);
> -blk->bs->blk = NULL;
> -bdrv_unref(blk->bs);
> -blk->bs = NULL;
> +blk_remove_bs(blk);
>  }
>  assert(QLIST_EMPTY(>remove_bs_notifiers.notifiers));
>  assert(QLIST_EMPTY(>insert_bs_notifiers.notifiers));
> @@ -351,6 +348,8 @@ void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend 
> *blk)
>   */
>  void blk_remove_bs(BlockBackend *blk)
>  {
> +assert(blk->bs->blk == blk);
> +
>  notifier_list_notify(>remove_bs_notifiers, blk);
>  
>  blk_update_root_state(blk);
> -- 
> 2.7.0
> 

Reviewed-by: Fam Zheng 




Re: [Qemu-devel] [PATCH v8 09/16] blockdev: Use blk_remove_bs() in do_drive_del()

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> Signed-off-by: Max Reitz 
> Reviewed-by: Kevin Wolf 
> ---
>  blockdev.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/blockdev.c b/blockdev.c
> index 1044a6a..09d4621 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -2792,7 +2792,7 @@ void hmp_drive_del(Monitor *mon, const QDict *qdict)
>  return;
>  }
>  
> -bdrv_close(bs);
> +blk_remove_bs(blk);
>  }
>  
>  /* if we have a device attached to this BlockDriverState
> -- 
> 2.7.0
> 

Reviewed-by: Fam Zheng 



Re: [Qemu-devel] [PATCHv2 08/10] target-ppc: Add new TLB invalidate by HPTE call for hash64 MMUs

2016-01-27 Thread David Gibson
On Thu, Jan 28, 2016 at 03:33:18PM +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> > When HPTEs are removed or modified by hypercalls on spapr, we need to
> > invalidate the relevant pages in the qemu TLB.
> > 
> > Currently we do that by doing some complicated calculations to work out the
> > right encoding for the tlbie instruction, then passing that to
> > ppc_tlb_invalidate_one()... which totally ignores the argument and flushes
> > the whole tlb.
> > 
> > Avoid that by adding a new flush-by-hpte helper in mmu-hash64.c.
> 
> Should we find a better "in between" so that in the long run we implement tlbie
> properly ? IE, tlbie will give us the page size using the same encoding
> as the HPTE iirc when L=1 ? To be honest the encoding of tlbie in arch
> 2.07 is so completely insane I have a hard time figuring it out myself
> ... :-)

I'm not entirely sure what the better in-between would be.  Having the
pagesize in tlbie isn't enough on its own - the bigger problem is that
we need a way of invalidating a whole congruence class of entries in
the qemu TLB, which it doesn't currently provide a means to do.

> Otherwise,
> 
> Acked-by: Benjamin Herrenschmidt 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCHv2 06/10] target-ppc: Remove unused mmu models from ppc_tlb_invalidate_one

2016-01-27 Thread David Gibson
On Thu, Jan 28, 2016 at 03:20:38PM +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> > ppc_tlb_invalidate_one() has a big switch handling many different MMU
> > types.  However, most of those branches can never be reached:
> > 
> > It is called from 3 places: from remove_hpte() and h_protect() in
> > spapr_hcall.c (which always has a 64-bit hash MMU type), and from
> > helper_tlbie() in mmu_helper.c.
> > 
> > Calls to helper_tlbie() are generated from gen_tlbie, gen_tlbiel and
> > gen_tlbiva.  The first two are only used with the PPC_MEM_TLBIE flag,
> > set only with 32-bit or 64-bit hash MMU models, and gen_tlbiva() is
> > used only on 440 and 460 models with the BookE mmu model.
> > 
> > This means the exhaustive list of MMU types which may call
> > ppc_tlb_invalidate_one() is: POWERPC_MMU_SOFT_6xx, POWERPC_MMU_601,
> > POWERPC_MMU_32B, POWERPC_MMU_SOFT_74xx, POWERPC_MMU_64B,
> > POWERPC_MMU_2_03,
> > POWERPC_MMU_2_06, POWERPC_MMU_2_07 and POWERPC_MMU_BOOKE.
> > 
> > Clean up by removing logic for all other MMU types from
> > ppc_tlb_invalidate_one().
> 
> I would argue to move hash64 out of it as well anyway. First what we do
> in there is dumb, but the way I change it with lazy inval differs and
> tlbie does provide additional information on server processors that
> we would need should we choose to implement fine-grained invalidations
> (such as the page size).

I agree, but I didn't want to postpone the current things I'm working
on while I did that.
 
> In the meantime:
> 
> Acked-by: Benjamin Herrenschmidt 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


[Qemu-devel] [PATCH v7 3/4] firmware: create directory hierarchy for sysfs fw_cfg entries

2016-01-27 Thread Gabriel L. Somlo
From: Gabriel Somlo 

Each fw_cfg entry of type "file" has an associated 56-char,
nul-terminated ASCII string which represents its name. While
the fw_cfg device doesn't itself impose any specific naming
convention, QEMU developers have traditionally used path name
semantics (i.e. "etc/acpi/rsdp") to descriptively name the
various fw_cfg "blobs" passed into the guest.

This patch attempts, on a best effort basis, to create a
directory hierarchy representing the content of fw_cfg file
names, under /sys/firmware/qemu_fw_cfg/by_name.

Upon successful creation of all directories representing the
"dirname" portion of a fw_cfg file, a symlink will be created
to represent the "basename", pointing at the appropriate
/sys/firmware/qemu_fw_cfg/by_key entry. If a file name is not
suitable for this procedure (e.g., if its basename or dirname
components collide with an already existing dirname component
or basename, respectively) the corresponding fw_cfg blob is
skipped and will remain available in sysfs only by its selector
key value.

Signed-off-by: Gabriel Somlo 
Cc: Andy Lutomirski 
---
 .../ABI/testing/sysfs-firmware-qemu_fw_cfg |  42 
 drivers/firmware/qemu_fw_cfg.c | 109 -
 2 files changed, 148 insertions(+), 3 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg 
b/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg
index e9e58d4..011dda4 100644
--- a/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg
+++ b/Documentation/ABI/testing/sysfs-firmware-qemu_fw_cfg
@@ -56,3 +56,45 @@ Description:
  entry via the control register, and reading a number
  of bytes equal to the blob size from the data
  register.
+
+   --- Listing fw_cfg blobs by file name ---
+
+   While the fw_cfg device does not impose any specific naming
+   convention on the blobs registered in the file directory,
+   QEMU developers have traditionally used path name semantics
+   to give each blob a descriptive name. For example:
+
+   "bootorder"
+   "genroms/kvmvapic.bin"
+   "etc/e820"
+   "etc/boot-fail-wait"
+   "etc/system-states"
+   "etc/table-loader"
+   "etc/acpi/rsdp"
+   "etc/acpi/tables"
+   "etc/smbios/smbios-tables"
+   "etc/smbios/smbios-anchor"
+   ...
+
+   In addition to the listing by unique selector key described
+   above, the fw_cfg sysfs driver also attempts to build a tree
+   of directories matching the path name components of fw_cfg
+   blob names, ending in symlinks to the by_key entry for each
+   "basename", as illustrated below (assume current directory is
+   /sys/firmware):
+
+   qemu_fw_cfg/by_name/bootorder -> ../by_key/38
+   qemu_fw_cfg/by_name/etc/e820 -> ../../by_key/35
+   qemu_fw_cfg/by_name/etc/acpi/rsdp -> ../../../by_key/41
+   ...
+
+   Construction of the directory tree and symlinks is done on a
+   "best-effort" basis, as there is no guarantee that components
+   of fw_cfg blob names are always "well behaved". I.e., there is
+   the possibility that a symlink (basename) will conflict with
+   a dirname component of another fw_cfg blob, in which case the
+   creation of the offending /sys/firmware/qemu_fw_cfg/by_name
+   entry will be skipped.
+
+   The authoritative list of entries will continue to be found
+   under the /sys/firmware/qemu_fw_cfg/by_key directory.
diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c
index 83e8a5c..19f6851 100644
--- a/drivers/firmware/qemu_fw_cfg.c
+++ b/drivers/firmware/qemu_fw_cfg.c
@@ -334,9 +334,103 @@ static struct bin_attribute fw_cfg_sysfs_attr_raw = {
.read = fw_cfg_sysfs_read_raw,
 };
 
-/* kobjects representing top-level and by_key folders */
+/*
+ * Create a kset subdirectory matching each '/' delimited dirname token
+ * in 'name', starting with sysfs kset/folder 'dir'; At the end, create
+ * a symlink directed at the given 'target'.
+ * NOTE: We do this on a best-effort basis, since 'name' is not guaranteed
+ * to be a well-behaved path name. Whenever a symlink vs. kset directory
+ * name collision occurs, the kernel will issue big scary warnings while
+ * refusing to add the offending link or directory. We follow up with our
+ * own, slightly less scary error messages explaining the situation :)
+ */
+static int fw_cfg_build_symlink(struct kset *dir,
+   struct kobject *target, 

[Qemu-devel] [PATCH v7 0/4] SysFS driver for QEMU fw_cfg device

2016-01-27 Thread Gabriel L. Somlo
From: "Gabriel Somlo" 

Allow access to QEMU firmware blobs, passed into the guest VM via
the fw_cfg device, through SysFS entries. Blob meta-data (e.g. name,
size, and fw_cfg key), as well as the raw binary blob data may be
accessed.

The SysFS access location is /sys/firmware/qemu_fw_cfg/... and was
selected based on overall similarity to the type of information
exposed under /sys/firmware/dmi/entries/...

This functionality is primarily intended to serve as a host->guest
configuration data transfer mechanism that is both:

- asynchronous: the host doesn't need to wait for the guest
to be ready to accept data (e.g., by starting
an agent daemon)

- out-of-band:  there is no need to commandeer a guest element
normally visible and available to the guest user
(e.g., kernel cmdline, mounting floppy/cdrom, etc.)

QEMU now allows arbitrary fw_cfg blobs to be added via the command line,
so it would be nice to make them easy to retrieve from within the guest
OS, and the nicest and easiest way I can think of is

cat /sys/firmware/qemu-fw-cfg/...//raw

New since v6:

- added architecture-specific default values for fw_cfg register
  offsets: DT and/or ACPI will only give us the base address and
  total size of the fw_cfg register set, but not individual register
  offsets *within* this total extent. The specific offsets are
  different across architectures, and this version adds #defines
  so that reasonable defaults can be used on each supported platform.

Thanks,
  --Gabriel

>New since v5:
>
>   - fixed typos in documentation files (Patches 1/4 and 4/4)
>
>   - printf/scanf type modifier for phys_addr_t now matches
> arch-specific width (u32 vs. u64), avoiding compiler warnings.
> (tested on i386 with and without PAE, and on armv7hl with and
>  without lpae -- the latter pair took quite a while on an
>  emulated QEMU guest :) )
>
>>New since v4:
>>
>>  Documentation (Patches 1/4 and 4/4) now points to the authoritative
>>  file in the QEMU source tree for any details related to the "hardware
>>  interface" of the fw_cfg device; Only details specific to sysfs (1/4) 
>>  and DT (4/4) should stay in the kernel docs.
>>
>>>New (since v3):
>>>
>>> Patch 1/4: Device probing now works with either ACPI, DT, or
>>>optionally by manually specifying a base, size, and
>>>register offsets on the command line. This way, all
>>>architectures offering fw_cfg can be supported, although
>>>x86 and ARM get *automatic* support via ACPI and/or DT.
>>>
>>>HUGE thanks to Laszlo Ersek  for
>>>pointing out drivers/virtio/virtio_mmio.c, as an example
>>>on how to pull this off !!!
>>>
>>>Stefan: I saw Marc's DMA patches to fw_cfg. Since only
>>>x86 and ARM will support it starting with QEMU 2.5, and
>>>since I expect to get lots of otherwise interesting (but
>>>otherwise orthogonal) feedback on this series, I'd like
>>>to stick with ioread8() across the board for now. We can
>>>always patch in DMA support in a backward compatible way
>>>later, once this series gets (hopefully) accepted :)
>>>
>>> Patch 2/4: (was 3/4 in v3): unchanged. Exports kset_find_obj() so
>>>modules can call it.
>>>
>>> Patch 3/4: (was 4/4 in v3): rebased, but otherwise the same.
>>>Essentially, creates a "human readable" directory
>>>hierarchy from "path-like" tokens making up fw_cfg
>>>blob names. I'm not really sure there's a way to make
>>>this happen via udev rules, but I have at least one
>>>potential use case for doing it *before* udev becomes
>>>available (cc: Andy Lutomirski ),
>>>so I'd be happy to leave this functionality in the
>>>kernel module. See further below for an illustration
>>>of this.
>>>
>>> Patch 4/4: Updates the existing ARM DT documentation for fw_cfg,
>>>mainly by pointing at the more comprehensive document
>>>introduced with Patch 1/4 for details on the fw_cfg
>>>device interface, leaving only the specific ARM/DT
>>>address/size node information in place.
>>>
  In addition to the "by_key" blob listing, e.g.:
  
  $ tree /sys/firmware/qemu_fw_cfg/
  /sys/firmware/qemu_fw_cfg/
  |-- by_key
  |   |-- 32
  |   |   |-- key
  |   |   |-- name("etc/boot-fail-wait")
  |   |   |-- raw
  |   |   `-- size
  |   |-- 33
  |   |   |-- key
  |   |  

[Qemu-devel] [PATCH v7 4/4] devicetree: update documentation for fw_cfg ARM bindings

2016-01-27 Thread Gabriel L. Somlo
From: Gabriel Somlo 

Remove fw_cfg hardware interface details from
Documentation/devicetree/bindings/arm/fw-cfg.txt,
and replace them with a pointer to the authoritative
documentation in the QEMU source tree.

Signed-off-by: Gabriel Somlo 
Cc: Laszlo Ersek 
Acked-by: Rob Herring 
Reviewed-by: Laszlo Ersek 
---
 Documentation/devicetree/bindings/arm/fw-cfg.txt | 38 ++--
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/fw-cfg.txt 
b/Documentation/devicetree/bindings/arm/fw-cfg.txt
index 953fb64..fd54e1d 100644
--- a/Documentation/devicetree/bindings/arm/fw-cfg.txt
+++ b/Documentation/devicetree/bindings/arm/fw-cfg.txt
@@ -11,43 +11,9 @@ QEMU exposes the control and data register to ARM guests as 
memory mapped
 registers; their location is communicated to the guest's UEFI firmware in the
 DTB that QEMU places at the bottom of the guest's DRAM.
 
-The guest writes a selector value (a key) to the selector register, and then
-can read the corresponding data (produced by QEMU) via the data register. If
-the selected entry is writable, the guest can rewrite it through the data
-register.
+The authoritative guest-side hardware interface documentation to the fw_cfg
+device can be found in "docs/specs/fw_cfg.txt" in the QEMU source tree.
 
-The selector register takes keys in big endian byte order.
-
-The data register allows accesses with 8, 16, 32 and 64-bit width (only at
-offset 0 of the register). Accesses larger than a byte are interpreted as
-arrays, bundled together only for better performance. The bytes constituting
-such a word, in increasing address order, correspond to the bytes that would
-have been transferred by byte-wide accesses in chronological order.
-
-The interface allows guest firmware to download various parameters and blobs
-that affect how the firmware works and what tables it installs for the guest
-OS. For example, boot order of devices, ACPI tables, SMBIOS tables, kernel and
-initrd images for direct kernel booting, virtual machine UUID, SMP information,
-virtual NUMA topology, and so on.
-
-The authoritative registry of the valid selector values and their meanings is
-the QEMU source code; the structure of the data blobs corresponding to the
-individual key values is also defined in the QEMU source code.
-
-The presence of the registers can be verified by selecting the "signature" blob
-with key 0x0000, and reading four bytes from the data register. The returned
-signature is "QEMU".
-
-The outermost protocol (involving the write / read sequences of the control and
-data registers) is expected to be versioned, and/or described by feature bits.
-The interface revision / feature bitmap can be retrieved with key 0x0001. The
-blob to be read from the data register has size 4, and it is to be interpreted
-as a uint32_t value in little endian byte order. The current value
-(corresponding to the above outer protocol) is zero.
-
-The guest kernel is not expected to use these registers (although it is
-certainly allowed to); the device tree bindings are documented here because
-this is where device tree bindings reside in general.
 
 Required properties:
 
-- 
2.4.3




Re: [Qemu-devel] [PATCH v8 04/16] virtio-blk: Functions for op blocker management

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> Put the code for setting up and removing op blockers into an own
> function, respectively. Then, we can invoke those functions whenever a
> BDS is removed from an virtio-blk BB or inserted into it.
> 
> Signed-off-by: Max Reitz 
> ---
>  hw/block/dataplane/virtio-blk.c | 77 
> +++--
>  1 file changed, 59 insertions(+), 18 deletions(-)
> 
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index bc34046..ee0c4d4 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -40,6 +40,8 @@ struct VirtIOBlockDataPlane {
>  EventNotifier *guest_notifier;  /* irq */
>  QEMUBH *bh; /* bh for guest notification */
>  
> +Notifier insert_notifier, remove_notifier;
> +
>  /* Note that these EventNotifiers are assigned by value.  This is
>   * fine as long as you do not call event_notifier_cleanup on them
>   * (because you don't own the file descriptor or handle; you just
> @@ -137,6 +139,54 @@ static void handle_notify(EventNotifier *e)
>  blk_io_unplug(s->conf->conf.blk);
>  }
>  
> +static void data_plane_set_up_op_blockers(VirtIOBlockDataPlane *s)
> +{
> +assert(!s->blocker);
> +error_setg(>blocker, "block device is in use by data plane");
> +blk_op_block_all(s->conf->conf.blk, s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_RESIZE, s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_DRIVE_DEL, s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, 
> s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_CHANGE, s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, 
> s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_COMMIT_TARGET, 
> s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_EJECT, s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
> +   s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
> +   s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
> +   s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, 
> s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_STREAM, s->blocker);
> +blk_op_unblock(s->conf->conf.blk, BLOCK_OP_TYPE_REPLACE, s->blocker);
> +}
> +
> +static void data_plane_remove_op_blockers(VirtIOBlockDataPlane *s)
> +{
> +if (s->blocker) {
> +blk_op_unblock_all(s->conf->conf.blk, s->blocker);
> +error_free(s->blocker);
> +s->blocker = NULL;
> +}
> +}
> +
> +static void data_plane_blk_insert_notifier(Notifier *n, void *data)
> +{
> +VirtIOBlockDataPlane *s = container_of(n, VirtIOBlockDataPlane,
> +   insert_notifier);
> +assert(s->conf->conf.blk == data);
> +data_plane_set_up_op_blockers(s);
> +}
> +
> +static void data_plane_blk_remove_notifier(Notifier *n, void *data)
> +{
> +VirtIOBlockDataPlane *s = container_of(n, VirtIOBlockDataPlane,
> +   remove_notifier);
> +assert(s->conf->conf.blk == data);
> +data_plane_remove_op_blockers(s);
> +}
> +
>  /* Context: QEMU global mutex held */
>  void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
>VirtIOBlockDataPlane **dataplane,
> @@ -179,22 +229,12 @@ void virtio_blk_data_plane_create(VirtIODevice *vdev, 
> VirtIOBlkConf *conf,
>  s->ctx = iothread_get_aio_context(s->iothread);
>  s->bh = aio_bh_new(s->ctx, notify_guest_bh, s);
>  
> -error_setg(>blocker, "block device is in use by data plane");
> -blk_op_block_all(conf->conf.blk, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_RESIZE, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_DRIVE_DEL, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_CHANGE, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_COMMIT_TARGET, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_EJECT, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT, 
> s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_INTERNAL_SNAPSHOT, 
> s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
> -   s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_STREAM, s->blocker);
> -blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_REPLACE, s->blocker);
> +   

Re: [Qemu-devel] [PATCH v8 14/16] block: Rewrite bdrv_close_all()

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> This patch rewrites bdrv_close_all(): Until now, all root BDSs have been
> force-closed. This is bad because it can lead to cached data not being
> flushed to disk.
> 
> Instead, try to make all reference holders relinquish their reference
> voluntarily:
> 
> 1. All BlockBackend users are handled by making all BBs simply eject
>their BDS tree. Since a BDS can never be on top of a BB, this will
>not cause any of the issues as seen with the force-closing of BDSs.
>The references will be relinquished and any further access to the BB
>will fail gracefully.
> 2. All BDSs which are owned by the monitor itself (because they do not
>have a BB) are relinquished next.
> 3. Besides BBs and the monitor, block jobs and other BDSs are the only
>things left that can hold a reference to BDSs. After every remaining
>block job has been canceled, there should not be any BDSs left (and
>the loop added here will always terminate (as long as NDEBUG is not
>defined), because either all_bdrv_states will be empty or there will
>not be any block job left to cancel, failing the assertion).
> 
> Signed-off-by: Max Reitz 
> Reviewed-by: Kevin Wolf 
> ---
>  block.c | 45 +
>  1 file changed, 37 insertions(+), 8 deletions(-)
> 
> diff --git a/block.c b/block.c
> index f8dd4a3..478e0db 100644
> --- a/block.c
> +++ b/block.c
> @@ -2145,9 +2145,7 @@ static void bdrv_close(BlockDriverState *bs)
>  {
>  BdrvAioNotifier *ban, *ban_next;
>  
> -if (bs->job) {
> -block_job_cancel_sync(bs->job);
> -}
> +assert(!bs->job);
>  
>  /* Disable I/O limits and drain all pending throttled requests */
>  if (bs->throttle_state) {
> @@ -2213,13 +2211,44 @@ static void bdrv_close(BlockDriverState *bs)
>  void bdrv_close_all(void)
>  {
>  BlockDriverState *bs;
> +AioContext *aio_context;
> +int original_refcount = 0;
>  
> -QTAILQ_FOREACH(bs, _states, device_list) {
> -AioContext *aio_context = bdrv_get_aio_context(bs);
> +/* Drop references from requests still in flight, such as canceled block
> + * jobs whose AIO context has not been polled yet */
> +bdrv_drain_all();
>  
> -aio_context_acquire(aio_context);
> -bdrv_close(bs);
> -aio_context_release(aio_context);
> +blockdev_close_all_bdrv_states();
> +blk_remove_all_bs();

This (monitor before BB) doesn't match the order in the commit message (BB
before monitor).

> +
> +/* Cancel all block jobs */
> +while (!QTAILQ_EMPTY(_bdrv_states)) {
> +QTAILQ_FOREACH(bs, _bdrv_states, bs_list) {
> +aio_context = bdrv_get_aio_context(bs);
> +
> +aio_context_acquire(aio_context);
> +if (bs->job) {
> +/* So we can safely query the current refcount */
> +bdrv_ref(bs);
> +original_refcount = bs->refcnt;
> +
> +block_job_cancel_sync(bs->job);
> +aio_context_release(aio_context);
> +break;
> +}
> +aio_context_release(aio_context);
> +}
> +
> +/* All the remaining BlockDriverStates are referenced directly or
> + * indirectly from block jobs, so there needs to be at least one BDS
> + * directly used by a block job */
> +assert(bs);
> +
> +/* Wait for the block job to release its reference */
> +while (bs->refcnt >= original_refcount) {
> +aio_poll(aio_context, true);

Why is this safe without acquiring aio_context? But oh wait, completions of
block jobs are deferred to main loop BH, so I think to release the reference,
aio_poll(qemu_get_aio_context(), ...) is the right thing to do.

This is also the problem in block_job_cancel_sync, which can loop forever waiting
for the job->completed flag without processing the main loop BH.

Fam

> +}
> +bdrv_unref(bs);
>  }
>  }
>  
> -- 
> 2.7.0
> 



Re: [Qemu-devel] [PATCHv2 03/10] target-ppc: Rework ppc_store_slb

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> ppc_store_slb updates the SLB for PPC cpus with 64-bit hash MMUs.
> Currently it takes two parameters, which contain values encoded as
> the
> register arguments to the slbmte instruction, one register contains
> the
> ESID portion of the SLBE and also the slot number, the other contains
> the
> VSID portion of the SLBE.
> 
> We're shortly going to want to do some SLB updates from other code
> where
> it is more convenient to supply the slot number and ESID separately,
> so
> rework this function and its callers to work this way.
> 
> As a bonus, this slightly simplifies the emulation of segment
> registers for
> when running a 32-bit OS on a 64-bit CPU.
> 
> Signed-off-by: David Gibson 

Acked-by: Benjamin Herrenschmidt 

> ---
>  target-ppc/kvm.c|  2 +-
>  target-ppc/mmu-hash64.c | 24 +---
>  target-ppc/mmu-hash64.h |  3 ++-
>  target-ppc/mmu_helper.c | 14 +-
>  4 files changed, 21 insertions(+), 22 deletions(-)
> 
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 98d7ba6..0f45380 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -1205,7 +1205,7 @@ int kvm_arch_get_registers(CPUState *cs)
>   * Only restore valid entries
>   */
>  if (rb & SLB_ESID_V) {
> -ppc_store_slb(cpu, rb, rs);
> +ppc_store_slb(cpu, rb & 0xfff, rb & ~0xfffULL, rs);
>  }
>  }
>  #endif
> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> index 03e25fd..6e05643 100644
> --- a/target-ppc/mmu-hash64.c
> +++ b/target-ppc/mmu-hash64.c
> @@ -135,28 +135,30 @@ void helper_slbie(CPUPPCState *env,
> target_ulong addr)
>  }
>  }
>  
> -int ppc_store_slb(PowerPCCPU *cpu, target_ulong rb, target_ulong rs)
> +int ppc_store_slb(PowerPCCPU *cpu, target_ulong slot,
> +  target_ulong esid, target_ulong vsid)
>  {
>  CPUPPCState *env = >env;
> -int slot = rb & 0xfff;
>  ppc_slb_t *slb = >slb[slot];
>  
> -if (rb & (0x1000 - env->slb_nr)) {
> -return -1; /* Reserved bits set or slot too high */
> +if (slot >= env->slb_nr) {
> +return -1; /* Bad slot number */
> +}
> +if (esid & ~(SLB_ESID_ESID | SLB_ESID_V)) {
> +return -1; /* Reserved bits set */
>  }
> -if (rs & (SLB_VSID_B & ~SLB_VSID_B_1T)) {
> +if (vsid & (SLB_VSID_B & ~SLB_VSID_B_1T)) {
>  return -1; /* Bad segment size */
>  }
> -if ((rs & SLB_VSID_B) && !(env->mmu_model & POWERPC_MMU_1TSEG))
> {
> +if ((vsid & SLB_VSID_B) && !(env->mmu_model &
> POWERPC_MMU_1TSEG)) {
>  return -1; /* 1T segment on MMU that doesn't support it */
>  }
>  
> -/* Mask out the slot number as we store the entry */
> -slb->esid = rb & (SLB_ESID_ESID | SLB_ESID_V);
> -slb->vsid = rs;
> +slb->esid = esid;
> +slb->vsid = vsid;
>  
>  LOG_SLB("%s: %d " TARGET_FMT_lx " - " TARGET_FMT_lx " => %016"
> PRIx64
> -" %016" PRIx64 "\n", __func__, slot, rb, rs,
> +" %016" PRIx64 "\n", __func__, slot, esid, vsid,
>  slb->esid, slb->vsid);
>  
>  return 0;
> @@ -196,7 +198,7 @@ void helper_store_slb(CPUPPCState *env,
> target_ulong rb, target_ulong rs)
>  {
>  PowerPCCPU *cpu = ppc_env_get_cpu(env);
>  
> -if (ppc_store_slb(cpu, rb, rs) < 0) {
> +if (ppc_store_slb(cpu, rb & 0xfff, rb & ~0xfffULL, rs) < 0) {
>  helper_raise_exception_err(env, POWERPC_EXCP_PROGRAM,
> POWERPC_EXCP_INVAL);
>  }
> diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
> index 6e3de7e..24fd2c4 100644
> --- a/target-ppc/mmu-hash64.h
> +++ b/target-ppc/mmu-hash64.h
> @@ -6,7 +6,8 @@
>  #ifdef TARGET_PPC64
>  void ppc_hash64_check_page_sizes(PowerPCCPU *cpu, Error **errp);
>  void dump_slb(FILE *f, fprintf_function cpu_fprintf, PowerPCCPU
> *cpu);
> -int ppc_store_slb(PowerPCCPU *cpu, target_ulong rb, target_ulong
> rs);
> +int ppc_store_slb(PowerPCCPU *cpu, target_ulong slot,
> +  target_ulong esid, target_ulong vsid);
>  hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU *cpu, target_ulong
> addr);
>  int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, target_ulong
> address, int rw,
>  int mmu_idx);
> diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
> index 0ab73bc..c040b17 100644
> --- a/target-ppc/mmu_helper.c
> +++ b/target-ppc/mmu_helper.c
> @@ -2088,21 +2088,17 @@ void helper_store_sr(CPUPPCState *env,
> target_ulong srnum, target_ulong value)
>  (int)srnum, value, env->sr[srnum]);
>  #if defined(TARGET_PPC64)
>  if (env->mmu_model & POWERPC_MMU_64) {
> -uint64_t rb = 0, rs = 0;
> +uint64_t esid, vsid;
>  
>  /* ESID = srnum */
> -rb |= ((uint32_t)srnum & 0xf) << 28;
> -/* Set the valid bit */
> -rb |= SLB_ESID_V;
> 

Re: [Qemu-devel] [PATCHv2 04/10] target-ppc: Rework SLB page size lookup

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> Currently, the ppc_hash64_page_shift() function looks up a page size
> based
> on information in an SLB entry.  It open codes the bit translation
> for
> existing CPUs, however different CPU models can have different SLB
> encodings.  We already store those in the 'sps' table in CPUPPCState,
> but
> we don't currently enforce that that actually matches the logic in
> ppc_hash64_page_shift.
> 
> This patch reworks lookup of page size from SLB in several ways:
>   * ppc_store_slb() will now fail (triggering an illegal instruction
> exception) if given a bad SLB page size encoding
>   * On success ppc_store_slb() stores a pointer to the relevant entry
> in
> the page size table in the SLB entry.  This is looked up directly
> from
> the published table of page size encodings, so can't get out ot
> sync.
>   * ppc_hash64_htab_lookup() and others now use this precached page
> size
> information rather than decoding the SLB values
>   * Now that callers have easy access to the page_shift,
> ppc_hash64_pte_raddr() amounts to just a deposit64(), so remove
> it and
> have the callers use deposit64() directly.
> 
> Signed-off-by: David Gibson 

Acked-by: Benjamin Herrenschmidt 

> ---
>  target-ppc/cpu.h|  1 +
>  target-ppc/machine.c| 20 +
>  target-ppc/mmu-hash64.c | 74 +++--
> 
>  3 files changed, 56 insertions(+), 39 deletions(-)
> 
> diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
> index 2bc96b4..0820390 100644
> --- a/target-ppc/cpu.h
> +++ b/target-ppc/cpu.h
> @@ -419,6 +419,7 @@ typedef struct ppc_slb_t ppc_slb_t;
>  struct ppc_slb_t {
>  uint64_t esid;
>  uint64_t vsid;
> +const struct ppc_one_seg_page_size *sps;
>  };
>  
>  #define MAX_SLB_ENTRIES 64
> diff --git a/target-ppc/machine.c b/target-ppc/machine.c
> index b61c060..ca62d3e 100644
> --- a/target-ppc/machine.c
> +++ b/target-ppc/machine.c
> @@ -2,6 +2,7 @@
>  #include "hw/boards.h"
>  #include "sysemu/kvm.h"
>  #include "helper_regs.h"
> +#include "mmu-hash64.h"
>  
>  static int cpu_load_old(QEMUFile *f, void *opaque, int version_id)
>  {
> @@ -352,11 +353,30 @@ static bool slb_needed(void *opaque)
>  return (cpu->env.mmu_model & POWERPC_MMU_64);
>  }
>  
> +static int slb_post_load(void *opaque, int version_id)
> +{
> +PowerPCCPU *cpu = opaque;
> +CPUPPCState *env = >env;
> +int i;
> +
> +/* We've pulled in the raw esid and vsid values from the
> migration
> + * stream, but we need to recompute the page size pointers */
> +for (i = 0; i < env->slb_nr; i++) {
> +if (ppc_store_slb(cpu, i, env->slb[i].esid, env-
> >slb[i].vsid) < 0) {
> +/* Migration source had bad values in its SLB */
> +return -1;
> +}
> +}
> +
> +return 0;
> +}
> +
>  static const VMStateDescription vmstate_slb = {
>  .name = "cpu/slb",
>  .version_id = 1,
>  .minimum_version_id = 1,
>  .needed = slb_needed,
> +.post_load = slb_post_load,
>  .fields = (VMStateField[]) {
>  VMSTATE_INT32_EQUAL(env.slb_nr, PowerPCCPU),
>  VMSTATE_SLB_ARRAY(env.slb, PowerPCCPU, MAX_SLB_ENTRIES),
> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> index 6e05643..b784791 100644
> --- a/target-ppc/mmu-hash64.c
> +++ b/target-ppc/mmu-hash64.c
> @@ -19,6 +19,7 @@
>   */
>  #include "cpu.h"
>  #include "exec/helper-proto.h"
> +#include "qemu/error-report.h"
>  #include "sysemu/kvm.h"
>  #include "kvm_ppc.h"
>  #include "mmu-hash64.h"
> @@ -140,6 +141,8 @@ int ppc_store_slb(PowerPCCPU *cpu, target_ulong
> slot,
>  {
>  CPUPPCState *env = >env;
>  ppc_slb_t *slb = >slb[slot];
> +const struct ppc_one_seg_page_size *sps = NULL;
> +int i;
>  
>  if (slot >= env->slb_nr) {
>  return -1; /* Bad slot number */
> @@ -154,8 +157,29 @@ int ppc_store_slb(PowerPCCPU *cpu, target_ulong
> slot,
>  return -1; /* 1T segment on MMU that doesn't support it */
>  }
>  
> +for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> +const struct ppc_one_seg_page_size *sps1 = >sps.sps[i];
> +
> +if (!sps1->page_shift) {
> +break;
> +}
> +
> +if ((vsid & SLB_VSID_LLP_MASK) == sps1->slb_enc) {
> +sps = sps1;
> +break;
> +}
> +}
> +
> +if (!sps) {
> +error_report("Bad page size encoding in SLB store: slot
> "TARGET_FMT_lu
> + " esid 0x"TARGET_FMT_lx" vsid 0x"TARGET_FMT_lx,
> + slot, esid, vsid);
> +return -1;
> +}
> +
>  slb->esid = esid;
>  slb->vsid = vsid;
> +slb->sps = sps;
>  
>  LOG_SLB("%s: %d " TARGET_FMT_lx " - " TARGET_FMT_lx " => %016"
> PRIx64
>  " %016" PRIx64 "\n", __func__, slot, esid, vsid,
> @@ -394,24 +418,6 @@ static hwaddr ppc_hash64_pteg_search(PowerPCCPU
> 

Re: [Qemu-devel] [PATCHv2 05/10] target-ppc: Use actual page size encodings from HPTE

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> At present the 64-bit hash MMU code uses information from the SLB to
> determine the page size of a translation.  We do need that
> information to
> correctly look up the hash table.  However the MMU also allows a
> possibly larger page size to be encoded into the HPTE itself, which
> is used
> to populate the TLB.  At present qemu doesn't check that, and so
> doesn't
> support the MPSS "Multiple Page Size per Segment" feature.
> 
> This makes a start on allowing this, by adding an hpte_page_shift()
> function which looks up the page size of an HPTE.  We use this to
> validate
> page sizes encodings on faults, and populate the qemu TLB with larger
> page sizes when appropriate.
> 
> Signed-off-by: David Gibson 

Acked-by: Benjamin Herrenschmidt 

(Note that we don't actually populate the QEMU TLB with larger page
sizes, it doesn't support it ... Also it tries to keep track of
the presence of large pages to deal with targeted invalidations
but we don't do the latter on hash64 and we wouldn't need that
extra care anyway since our tlbie carries the page size as well).

Cheers,
Ben.

> ---
>  target-ppc/mmu-hash64.c | 63
> ++---
>  1 file changed, 60 insertions(+), 3 deletions(-)
> 
> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> index b784791..ee1e8bf 100644
> --- a/target-ppc/mmu-hash64.c
> +++ b/target-ppc/mmu-hash64.c
> @@ -21,6 +21,7 @@
>  #include "exec/helper-proto.h"
>  #include "qemu/error-report.h"
>  #include "sysemu/kvm.h"
> +#include "qemu/error-report.h"
>  #include "kvm_ppc.h"
>  #include "mmu-hash64.h"
>  
> @@ -474,12 +475,50 @@ static hwaddr ppc_hash64_htab_lookup(PowerPCCPU
> *cpu,
>  return pte_offset;
>  }
>  
> +static unsigned hpte_page_shift(const struct ppc_one_seg_page_size
> *sps,
> +uint64_t pte0, uint64_t pte1)
> +{
> +int i;
> +
> +if (!(pte0 & HPTE64_V_LARGE)) {
> +if (sps->page_shift != 12) {
> +/* 4kiB page in a non 4kiB segment */
> +return 0;
> +}
> +/* Normal 4kiB page */
> +return 12;
> +}
> +
> +for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
> +const struct ppc_one_page_size *ps = >enc[i];
> +uint64_t mask;
> +
> +if (!ps->page_shift) {
> +break;
> +}
> +
> +if (ps->page_shift == 12) {
> +/* L bit is set so this can't be a 4kiB page */
> +continue;
> +}
> +
> +mask = ((1ULL << ps->page_shift) - 1) & HPTE64_R_RPN;
> +
> +if ((pte1 & mask) == (ps->pte_enc << HPTE64_R_RPN_SHIFT)) {
> +return ps->page_shift;
> +}
> +}
> +
> +return 0; /* Bad page size encoding */
> +}
> +
>  int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu, target_ulong eaddr,
>  int rwx, int mmu_idx)
>  {
>  CPUState *cs = CPU(cpu);
>  CPUPPCState *env = >env;
>  ppc_slb_t *slb;
> +unsigned apshift;
>  hwaddr pte_offset;
>  ppc_hash_pte64_t pte;
>  int pp_prot, amr_prot, prot;
> @@ -543,6 +582,18 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU *cpu,
> target_ulong eaddr,
>  qemu_log_mask(CPU_LOG_MMU,
>  "found PTE at offset %08" HWADDR_PRIx "\n",
> pte_offset);
>  
> +/* Validate page size encoding */
> +apshift = hpte_page_shift(slb->sps, pte.pte0, pte.pte1);
> +if (!apshift) {
> +error_report("Bad page size encoding in HPTE 0x%"PRIx64" -
> 0x%"PRIx64
> + " @ 0x%"HWADDR_PRIx, pte.pte0, pte.pte1,
> pte_offset);
> +/* Not entirely sure what the right action here, but machine
> + * check seems reasonable */
> +cs->exception_index = POWERPC_EXCP_MCHECK;
> +env->error_code = 0;
> +return 1;
> +}
> +
>  /* 5. Check access permissions */
>  
>  pp_prot = ppc_hash64_pte_prot(cpu, slb, pte);
> @@ -595,10 +646,10 @@ int ppc_hash64_handle_mmu_fault(PowerPCCPU
> *cpu, target_ulong eaddr,
>  
>  /* 7. Determine the real address from the PTE */
>  
> -raddr = deposit64(pte.pte1 & HPTE64_R_RPN, 0, slb->sps-
> >page_shift, eaddr);
> +raddr = deposit64(pte.pte1 & HPTE64_R_RPN, 0, apshift, eaddr);
>  
>  tlb_set_page(cs, eaddr & TARGET_PAGE_MASK, raddr &
> TARGET_PAGE_MASK,
> - prot, mmu_idx, TARGET_PAGE_SIZE);
> + prot, mmu_idx, 1ULL << apshift);
>  
>  return 0;
>  }
> @@ -609,6 +660,7 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU
> *cpu, target_ulong addr)
>  ppc_slb_t *slb;
>  hwaddr pte_offset;
>  ppc_hash_pte64_t pte;
> +unsigned apshift;
>  
>  if (msr_dr == 0) {
>  /* In real mode the top 4 effective address bits are ignored
> */
> @@ -625,7 +677,12 @@ hwaddr ppc_hash64_get_phys_page_debug(PowerPCCPU
> *cpu, target_ulong addr)
>  return -1;
>  }
>  
> -return 

Re: [Qemu-devel] [PATCHv2 10/10] target-ppc: Allow more page sizes for POWER7 & POWER8 in TCG

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> Now that the TCG and spapr code has been extended to allow (semi-)
> arbitrary page encodings in the CPU's 'sps' table, we can add the
> many
> page sizes supported by real POWER7 and POWER8 hardware that we
> previously
> didn't support in TCG.
> 
> Signed-off-by: David Gibson 

Acked-by: Benjamin Herrenschmidt 

> ---
>  target-ppc/mmu-hash64.h |  2 ++
>  target-ppc/translate_init.c | 32 
>  2 files changed, 34 insertions(+)
> 
> diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
> index 34cf975..ab0f86b 100644
> --- a/target-ppc/mmu-hash64.h
> +++ b/target-ppc/mmu-hash64.h
> @@ -48,6 +48,8 @@ unsigned
> ppc_hash64_hpte_page_shift_noslb(PowerPCCPU *cpu,
>  #define SLB_VSID_LLP_MASK   (SLB_VSID_L | SLB_VSID_LP)
>  #define SLB_VSID_4K 0xULL
>  #define SLB_VSID_64K0x0110ULL
> +#define SLB_VSID_16M0x0100ULL
> +#define SLB_VSID_16G0x0120ULL
>  
>  /*
>   * Hash page table definitions
> diff --git a/target-ppc/translate_init.c b/target-
> ppc/translate_init.c
> index f6babd2..32b3679 100644
> --- a/target-ppc/translate_init.c
> +++ b/target-ppc/translate_init.c
> @@ -8104,6 +8104,36 @@ static Property powerpc_servercpu_properties[]
> = {
>  DEFINE_PROP_END_OF_LIST(),
>  };
>  
> +#ifdef CONFIG_SOFTMMU
> +static const struct ppc_segment_page_sizes POWER7_POWER8_sps = {
> +.sps = {
> +{
> +.page_shift = 12, /* 4K */
> +.slb_enc = 0,
> +.enc = { { .page_shift = 12, .pte_enc = 0 },
> + { .page_shift = 16, .pte_enc = 0x7 },
> + { .page_shift = 24, .pte_enc = 0x38 }, },
> +},
> +{
> +.page_shift = 16, /* 64K */
> +.slb_enc = SLB_VSID_64K,
> +.enc = { { .page_shift = 16, .pte_enc = 0x1 },
> + { .page_shift = 24, .pte_enc = 0x8 }, },
> +},
> +{
> +.page_shift = 24, /* 16M */
> +.slb_enc = SLB_VSID_16M,
> +.enc = { { .page_shift = 24, .pte_enc = 0 }, },
> +},
> +{
> +.page_shift = 34, /* 16G */
> +.slb_enc = SLB_VSID_16G,
> +.enc = { { .page_shift = 34, .pte_enc = 0x3 }, },
> +},
> +}
> +};
>
> +#endif /* CONFIG_SOFTMMU */
> +
>  static void init_proc_POWER7 (CPUPPCState *env)
>  {
>  init_proc_book3s_64(env, BOOK3S_CPU_POWER7);
> @@ -8167,6 +8197,7 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void
> *data)
>  pcc->mmu_model = POWERPC_MMU_2_06;
>  #if defined(CONFIG_SOFTMMU)
>  pcc->handle_mmu_fault = ppc_hash64_handle_mmu_fault;
> +pcc->sps = _POWER8_sps;
>  #endif
>  pcc->excp_model = POWERPC_EXCP_POWER7;
>  pcc->bus_model = PPC_FLAGS_INPUT_POWER7;
> @@ -8247,6 +8278,7 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void
> *data)
>  pcc->mmu_model = POWERPC_MMU_2_07;
>  #if defined(CONFIG_SOFTMMU)
>  pcc->handle_mmu_fault = ppc_hash64_handle_mmu_fault;
> +pcc->sps = _POWER8_sps;
>  #endif
>  pcc->excp_model = POWERPC_EXCP_POWER7;
>  pcc->bus_model = PPC_FLAGS_INPUT_POWER7;


Re: [Qemu-devel] [PATCH v7 1/4] firmware: introduce sysfs driver for QEMU's fw_cfg device

2016-01-27 Thread kbuild test robot
Hi Gabriel,

[auto build test WARNING on driver-core/driver-core-testing]
[also build test WARNING on v4.5-rc1 next-20160127]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Gabriel-L-Somlo/SysFS-driver-for-QEMU-fw_cfg-device/20160128-111609
config: ia64-allyesconfig (attached as .config)
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=ia64 

All warnings (new ones prefixed by >>):

>> drivers/firmware/qemu_fw_cfg.c:112:4: warning: #warning "QEMU FW_CFG may not 
>> be available on this architecture!" [-Wcpp]
#  warning "QEMU FW_CFG may not be available on this architecture!"
   ^

vim +112 drivers/firmware/qemu_fw_cfg.c

96  release_region(fw_cfg_p_base, fw_cfg_p_size);
97  }
98  }
99  
   100  /* arch-specific ctrl & data register offsets are not available in 
ACPI, DT */
   101  #if !(defined(FW_CFG_CTRL_OFF) && defined(FW_CTRL_DATA_OFF))
   102  # if (defined(CONFIG_ARM) || defined(CONFIG_ARM64))
   103  #  define FW_CFG_CTRL_OFF 0x08
   104  #  define FW_CFG_DATA_OFF 0x00
   105  # elif (defined(CONFIG_PPC_PMAC) || defined(CONFIG_SPARC32)) /* 
ppc/mac,sun4m */
   106  #  define FW_CFG_CTRL_OFF 0x00
   107  #  define FW_CFG_DATA_OFF 0x02
   108  # elif (defined(CONFIG_X86) || defined(CONFIG_SPARC64)) /* x86, sun4u */
   109  #  define FW_CFG_CTRL_OFF 0x00
   110  #  define FW_CFG_DATA_OFF 0x01
   111  # else
 > 112  #  warning "QEMU FW_CFG may not be available on this architecture!"
   113  #  define FW_CFG_CTRL_OFF 0x00
   114  #  define FW_CFG_DATA_OFF 0x01
   115  # endif
   116  #endif
   117  
   118  /* initialize fw_cfg device i/o from platform data */
   119  static int fw_cfg_do_platform_probe(struct platform_device *pdev)
   120  {

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data


Re: [Qemu-devel] [PATCH V2] net/traffic-mirror:Add traffic-mirror

2016-01-27 Thread Jason Wang


On 01/27/2016 10:40 AM, Zhang Chen wrote:
> From: ZhangChen 
>
> Traffic-mirror is a netfilter plugin.
> It gives qemu the ability to copy and mirror guest's
> net packet. we output packet to chardev.
>
> usage:
>
> -netdev tap,id=hn0
> -chardev socket,id=mirror0,host=ip_primary,port=X,server,nowait
> -traffic-mirror,id=m0,netdev=hn0,queue=tx/rx/all,outdev=mirror0
>
> Signed-off-by: ZhangChen 
> Signed-off-by: Wen Congyang 
> Reviewed-by: Yang Hongyang 

Thanks for the patch. Several questions:

- I'm curious about how the patch was tested? Simple setup e.g:

-netdev tap,id=hn0 -device virtio-net-pci,netdev=hn0 -chardev
socket,id=c0,host=localhost,port=,server,nowait -object
traffic-mirror,netdev=hn0,outdev=c0,id=f0 -netdev
socket,id=s0,connect=127.0.0.1: -device e1000,netdev=s0

does not work for me.

- Is reliable mirroring (e.g. no packet drops during mirroring)
needed for COLO? If yes, this patch does not seem to guarantee it.
- Please consider writing a unit test for this patch.

And see comments below.

Thanks


> ---
>  net/Makefile.objs|   1 +
>  net/traffic-mirror.c | 173 
> +++
>  qemu-options.hx  |   5 ++
>  vl.c |   3 +-
>  4 files changed, 181 insertions(+), 1 deletion(-)
>  create mode 100644 net/traffic-mirror.c
>
> diff --git a/net/Makefile.objs b/net/Makefile.objs
> index 5fa2f97..de06ebe 100644
> --- a/net/Makefile.objs
> +++ b/net/Makefile.objs
> @@ -15,3 +15,4 @@ common-obj-$(CONFIG_VDE) += vde.o
>  common-obj-$(CONFIG_NETMAP) += netmap.o
>  common-obj-y += filter.o
>  common-obj-y += filter-buffer.o
> +common-obj-y += traffic-mirror.o

Let's s/traffic-mirror/filter-mirror/g to be consistent with other filters.

> diff --git a/net/traffic-mirror.c b/net/traffic-mirror.c
> new file mode 100644
> index 000..bed915c
> --- /dev/null
> +++ b/net/traffic-mirror.c
> @@ -0,0 +1,173 @@
> +/*
> + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
> + * Copyright (c) 2016 FUJITSU LIMITED
> + * Copyright (c) 2016 Intel Corporation
> + *
> + * Author: Zhang Chen 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#include "net/filter.h"
> +#include "net/net.h"
> +#include "qemu-common.h"
> +#include "qapi/qmp/qerror.h"
> +#include "qapi-visit.h"
> +#include "qom/object.h"
> +#include "qemu/main-loop.h"
> +#include "qemu/error-report.h"
> +#include "trace.h"
> +#include "sysemu/char.h"
> +#include "qemu/iov.h"
> +
> +#define FILTER_TRAFFIC_MIRROR(obj) \
> +OBJECT_CHECK(MirrorState, (obj), TYPE_FILTER_TRAFFIC_MIRROR)
> +
> +#define TYPE_FILTER_TRAFFIC_MIRROR "traffic-mirror"
> +
> +typedef struct MirrorState {
> +NetFilterState parent_obj;
> +char *outdev;
> +CharDriverState *chr_out;
> +
> +} MirrorState;
> +
> +static ssize_t traffic_mirror_send(NetFilterState *nf,
> +   const struct iovec *iov,
> +   int iovcnt)
> +{
> +MirrorState *s = FILTER_TRAFFIC_MIRROR(nf);
> +ssize_t ret = 0;
> +ssize_t size = 0;
> +char *buf;
> +
> +size = iov_size(iov, iovcnt);
> +if (!size) {
> +return 0;
> +}
> +
> +buf = g_malloc0(size);
> +iov_to_buf(iov, iovcnt, 0, buf, size);
> +ret = qemu_chr_fe_write(s->chr_out, (uint8_t *)&size, sizeof(size));

htonl(size)?

> +if (ret < 0) {

This check is not sufficient: for some reason, only part of the packet
may be sent by the socket. This needs to be handled properly, otherwise it
may confuse the receiver.

> +g_free(buf);
> +return ret;
> +}
> +
> +ret = qemu_chr_fe_write(s->chr_out, (uint8_t *)buf, size);
> +g_free(buf);
> +return ret;

Ditto.

> +}
> +
> +static ssize_t traffic_mirror_receive_iov(NetFilterState *nf,
> + NetClientState *sender,
> + unsigned flags,
> + const struct iovec *iov,
> + int iovcnt,
> + NetPacketSent *sent_cb)
> +{
> +/*
> + * We copy and mirror packet to outdev,
> + * then put back the packet.
> + */

The code could explain itself, so the comment is unnecessary.

> +ssize_t ret = 0;
> +
> +ret = traffic_mirror_send(nf, iov, iovcnt);
> +if (ret < 0) {
> +error_report("traffic_mirror_send failed");

Monitor could be flooded by this.

> +}
> +
> +return 0;
> +}
> +

Otherwise looks good.




Re: [Qemu-devel] [PATCH v8 07/16] block: Remove BDS close notifier

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> It is unused now, so we can remove it.
> 
> Signed-off-by: Max Reitz 
> ---
>  block.c| 8 
>  block/block-backend.c  | 7 ---
>  include/block/block.h  | 1 -
>  include/block/block_int.h  | 2 --
>  include/sysemu/block-backend.h | 1 -
>  5 files changed, 19 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 9a31e20..a6da333 100644
> --- a/block.c
> +++ b/block.c
> @@ -259,7 +259,6 @@ BlockDriverState *bdrv_new(void)
>  for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
>  QLIST_INIT(&bs->op_blockers[i]);
>  }
> -notifier_list_init(&bs->close_notifiers);
>  notifier_with_return_list_init(&bs->before_write_notifiers);
>  qemu_co_queue_init(&bs->throttled_reqs[0]);
>  qemu_co_queue_init(&bs->throttled_reqs[1]);
> @@ -269,11 +268,6 @@ BlockDriverState *bdrv_new(void)
>  return bs;
>  }
>  
> -void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify)
> -{
> -notifier_list_add(&bs->close_notifiers, notify);
> -}
> -
>  BlockDriver *bdrv_find_format(const char *format_name)
>  {
>  BlockDriver *drv1;
> @@ -2157,8 +2151,6 @@ void bdrv_close(BlockDriverState *bs)
>  bdrv_flush(bs);
>  bdrv_drain(bs); /* in case flush left pending I/O */
>  
> -notifier_list_notify(&bs->close_notifiers, bs);
> -
>  bdrv_release_all_dirty_bitmaps(bs);
>  
>  if (bs->blk) {
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 1872191..621787c 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1146,13 +1146,6 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, 
> Notifier *notify)
>  notifier_list_add(&blk->insert_bs_notifiers, notify);
>  }
>  
> -void blk_add_close_notifier(BlockBackend *blk, Notifier *notify)
> -{
> -if (blk->bs) {
> -bdrv_add_close_notifier(blk->bs, notify);
> -}
> -}
> -
>  void blk_io_plug(BlockBackend *blk)
>  {
>  if (blk->bs) {
> diff --git a/include/block/block.h b/include/block/block.h
> index 25f36dc..c7345de 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -226,7 +226,6 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
>  void bdrv_reopen_commit(BDRVReopenState *reopen_state);
>  void bdrv_reopen_abort(BDRVReopenState *reopen_state);
>  void bdrv_close(BlockDriverState *bs);
> -void bdrv_add_close_notifier(BlockDriverState *bs, Notifier *notify);
>  int bdrv_read(BlockDriverState *bs, int64_t sector_num,
>uint8_t *buf, int nb_sectors);
>  int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index ec31df1..8730cf6 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -403,8 +403,6 @@ struct BlockDriverState {
>  BdrvChild *backing;
>  BdrvChild *file;
>  
> -NotifierList close_notifiers;
> -
>  /* Callback before write request is processed */
>  NotifierWithReturnList before_write_notifiers;
>  
> diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
> index e12be67..ae4efb4 100644
> --- a/include/sysemu/block-backend.h
> +++ b/include/sysemu/block-backend.h
> @@ -166,7 +166,6 @@ void blk_remove_aio_context_notifier(BlockBackend *blk,
>   void *opaque);
>  void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify);
>  void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify);
> -void blk_add_close_notifier(BlockBackend *blk, Notifier *notify);
>  void blk_io_plug(BlockBackend *blk);
>  void blk_io_unplug(BlockBackend *blk);
>  BlockAcctStats *blk_get_stats(BlockBackend *blk);
> -- 
> 2.7.0
> 

Reviewed-by: Fam Zheng 



Re: [Qemu-devel] [PATCH v8 06/16] nbd: Switch from close to eject notifier

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> The NBD code uses the BDS close notifier to determine when a medium is
> ejected. However, now it should use the BB's BDS removal notifier for
> that instead of the BDS's close notifier.
> 
> Signed-off-by: Max Reitz 
> ---
>  blockdev-nbd.c | 40 +---
>  nbd/server.c   | 13 +
>  2 files changed, 18 insertions(+), 35 deletions(-)
> 
> diff --git a/blockdev-nbd.c b/blockdev-nbd.c
> index 4a758ac..9d6a21c 100644
> --- a/blockdev-nbd.c
> +++ b/blockdev-nbd.c
> @@ -45,37 +45,11 @@ void qmp_nbd_server_start(SocketAddress *addr, Error 
> **errp)
>  }
>  }
>  
> -/*
> - * Hook into the BlockBackend notifiers to close the export when the
> - * backend is closed.
> - */
> -typedef struct NBDCloseNotifier {
> -Notifier n;
> -NBDExport *exp;
> -QTAILQ_ENTRY(NBDCloseNotifier) next;
> -} NBDCloseNotifier;
> -
> -static QTAILQ_HEAD(, NBDCloseNotifier) close_notifiers =
> -QTAILQ_HEAD_INITIALIZER(close_notifiers);
> -
> -static void nbd_close_notifier(Notifier *n, void *data)
> -{
> -NBDCloseNotifier *cn = DO_UPCAST(NBDCloseNotifier, n, n);
> -
> -notifier_remove(&cn->n);
> -QTAILQ_REMOVE(&close_notifiers, cn, next);
> -
> -nbd_export_close(cn->exp);
> -nbd_export_put(cn->exp);
> -g_free(cn);
> -}
> -
>  void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
>  Error **errp)
>  {
>  BlockBackend *blk;
>  NBDExport *exp;
> -NBDCloseNotifier *n;
>  
>  if (server_fd == -1) {
>  error_setg(errp, "NBD server not running");
> @@ -113,19 +87,15 @@ void qmp_nbd_server_add(const char *device, bool 
> has_writable, bool writable,
>  
>  nbd_export_set_name(exp, device);
>  
> -n = g_new0(NBDCloseNotifier, 1);
> -n->n.notify = nbd_close_notifier;
> -n->exp = exp;
> -blk_add_close_notifier(blk, &n->n);
> -QTAILQ_INSERT_TAIL(&close_notifiers, n, next);
> +/* The list of named exports has a strong reference to this export now 
> and
> + * our only way of accessing it is through nbd_export_find(), so we can 
> drop
> + * the strong reference that is @exp. */

Not quite sure about the meaning of "the strong reference that is @exp", I
guess you mean the one reference born in nbd_export_new(), which would match
the code.  Other than this,

Reviewed-by: Fam Zheng 



Re: [Qemu-devel] [PATCH v8 12/16] blockdev: Keep track of monitor-owned BDS

2016-01-27 Thread Fam Zheng
On Wed, 01/27 18:59, Max Reitz wrote:
> Signed-off-by: Max Reitz 
> ---
>  blockdev.c | 26 ++
>  include/block/block_int.h  |  4 
>  stubs/Makefile.objs|  1 +
>  stubs/blockdev-close-all-bdrv-states.c |  5 +
>  4 files changed, 36 insertions(+)
>  create mode 100644 stubs/blockdev-close-all-bdrv-states.c
> 
> diff --git a/blockdev.c b/blockdev.c
> index 09d4621..ac93f43 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -50,6 +50,9 @@
>  #include "trace.h"
>  #include "sysemu/arch_init.h"
>  
> +static QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states =
> +QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states);
> +
>  static const char *const if_name[IF_COUNT] = {
>  [IF_NONE] = "none",
>  [IF_IDE] = "ide",
> @@ -702,6 +705,19 @@ fail:
>  return NULL;
>  }
>  
> +void blockdev_close_all_bdrv_states(void)
> +{
> +BlockDriverState *bs, *next_bs;
> +
> +QTAILQ_FOREACH_SAFE(bs, &monitor_bdrv_states, monitor_list, next_bs) {
> +AioContext *ctx = bdrv_get_aio_context(bs);
> +
> +aio_context_acquire(ctx);
> +bdrv_unref(bs);
> +aio_context_release(ctx);
> +}
> +}
> +
>  static void qemu_opt_rename(QemuOpts *opts, const char *from, const char *to,
>  Error **errp)
>  {
> @@ -3875,12 +3891,15 @@ void qmp_blockdev_add(BlockdevOptions *options, Error 
> **errp)
>  if (!bs) {
>  goto fail;
>  }
> +
> +QTAILQ_INSERT_TAIL(&monitor_bdrv_states, bs, monitor_list);
>  }
>  
>  if (bs && bdrv_key_required(bs)) {
>  if (blk) {
>  blk_unref(blk);
>  } else {
> +QTAILQ_REMOVE(&monitor_bdrv_states, bs, monitor_list);
>  bdrv_unref(bs);
>  }
>  error_setg(errp, "blockdev-add doesn't support encrypted devices");
> @@ -3945,11 +3964,18 @@ void qmp_x_blockdev_del(bool has_id, const char *id,
> bdrv_get_device_or_node_name(bs));
>  goto out;
>  }
> +
> +if (!blk && !bs->monitor_list.tqe_prev) {
> +error_setg(errp, "Node %s is not owned by the monitor",
> +   bs->node_name);
> +goto out;
> +}

Is this an extra restriction added by this patch? If so, does it deserve a
few words in the commit message?

>  }
>  
>  if (blk) {
>  blk_unref(blk);
>  } else {
> +QTAILQ_REMOVE(&monitor_bdrv_states, bs, monitor_list);
>  bdrv_unref(bs);
>  }
>  
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 1e4c518..dd00d12 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -445,6 +445,8 @@ struct BlockDriverState {
>  QTAILQ_ENTRY(BlockDriverState) device_list;
>  /* element of the list of all BlockDriverStates (all_bdrv_states) */
>  QTAILQ_ENTRY(BlockDriverState) bs_list;
> +/* element of the list of monitor-owned BDS */
> +QTAILQ_ENTRY(BlockDriverState) monitor_list;
>  QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
>  int refcnt;
>  
> @@ -707,4 +709,6 @@ bool bdrv_requests_pending(BlockDriverState *bs);
>  void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
>  void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
>  
> +void blockdev_close_all_bdrv_states(void);
> +
>  #endif /* BLOCK_INT_H */
> diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
> index d7898a0..e922de9 100644
> --- a/stubs/Makefile.objs
> +++ b/stubs/Makefile.objs
> @@ -1,5 +1,6 @@
>  stub-obj-y += arch-query-cpu-def.o
>  stub-obj-y += bdrv-commit-all.o
> +stub-obj-y += blockdev-close-all-bdrv-states.o
>  stub-obj-y += clock-warp.o
>  stub-obj-y += cpu-get-clock.o
>  stub-obj-y += cpu-get-icount.o
> diff --git a/stubs/blockdev-close-all-bdrv-states.c 
> b/stubs/blockdev-close-all-bdrv-states.c
> new file mode 100644
> index 000..12d2442
> --- /dev/null
> +++ b/stubs/blockdev-close-all-bdrv-states.c
> @@ -0,0 +1,5 @@
> +#include "block/block_int.h"
> +
> +void blockdev_close_all_bdrv_states(void)
> +{
> +}
> -- 
> 2.7.0
> 



[Qemu-devel] [PATCH 1/1] Protect stderr from non-blocking mode

2016-01-27 Thread Sam Bobroff
On Linux, if QEMU is run from a shell with -d to enable debug logging
but without directing it to a file (e.g. -D is not used, and
qemu_logfile is set to stderr), and no shell redirection is used, it
is possible for log messages to be lost under load.

This is caused by a combination of several factors:

* The shell (e.g. bash) may provide stdin, stdout and stderr as
  duplicates of a single open file, so they share file status flags
  including O_NONBLOCK.

* As character devices are registered (see qemu_chr_open_stdio()),
  stdin and stdout are set non-blocking.

* The printf() family of functions, including fprintf(), are not
  "non-blocking" aware: if they receive EAGAIN after partially
  writing their output, they immediately return EAGAIN with no way to
  discover how much output was written, if any.

So O_NONBLOCK is set on stdin/stdout, which causes O_NONBLOCK to be
set on stderr, and qemu_logfile is set to stderr. Then, under load,
fprintf()s to qemu_logfile return EAGAIN, which is ignored and data is
lost. This can't be fixed by handling EAGAIN because an unknown amount
of data has been written.

This patch works around the issue by re-opening the underlying tty
file, which is available in Linux as /proc/self/fd/2, and duplicating
it into fd 2 which causes stderr to refer to a new, unshared, "file
description". Additionally, if we end up logging to a non-blocking
file descriptor on a non-Linux platform, where no workaround has been
implemented, we display a warning message.

Signed-off-by: Sam Bobroff 
---
This is somewhat of an RFC as the issue probably hasn't been noticed
very often (ever?) and there are several approaches to addressing it.
The discussion was a while ago, here:

http://lists.nongnu.org/archive/html/qemu-devel/2015-12/msg00761.html

 include/sysemu/os-posix.h |  2 +-
 include/sysemu/os-win32.h |  2 +-
 os-posix.c| 30 +-
 os-win32.c|  2 +-
 vl.c  |  2 +-
 5 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
index f131521..7c7a36b 100644
--- a/include/sysemu/os-posix.h
+++ b/include/sysemu/os-posix.h
@@ -28,7 +28,7 @@
 
 #include 
 
-void os_set_line_buffering(void);
+void os_setup_stdio(void);
 void os_set_proc_name(const char *s);
 void os_setup_signal_handling(void);
 void os_daemonize(void);
diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
index 400e098..b55c5d4 100644
--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -84,7 +84,7 @@ struct tm *localtime_r(const time_t *timep, struct tm 
*result);
 static inline void os_setup_signal_handling(void) {}
 static inline void os_daemonize(void) {}
 static inline void os_setup_post(void) {}
-void os_set_line_buffering(void);
+void os_setup_stdio(void);
 static inline void os_set_proc_name(const char *dummy) {}
 
 int getpagesize(void);
diff --git a/os-posix.c b/os-posix.c
index e4da406..d9afa6d 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -40,6 +40,7 @@
 #include "net/slirp.h"
 #include "qemu-options.h"
 #include "qemu/rcu.h"
+#include "qemu/log.h"
 
 #ifdef CONFIG_LINUX
 #include 
@@ -256,6 +257,17 @@ void os_setup_post(void)
 {
 int fd = 0;
 
+if (qemu_logfile == stderr) {
+if (fcntl(2, F_GETFL) & O_NONBLOCK) {
+/* See os_setup_stdio() for an explanation. */
+fprintf(stderr,
+"Note: stderr has been set non-blocking which can cause log messages to be\n"
+"lost under load. This can be avoided by directing log messages to a file\n"
+"(using -D) or by redirecting stderr using your shell.\n"
+);
+}
+}
+
 if (daemonize) {
 if (chdir("/")) {
 perror("not able to chdir to /");
@@ -289,8 +301,24 @@ void os_setup_post(void)
 }
 }
 
-void os_set_line_buffering(void)
+void os_setup_stdio(void)
 {
+#if defined(__linux__)
+/* In some situations (e.g. running QEMU from a bash command line and not using
+ * any redirection) the shell may have set up stdin, stdout and stderr as
+ * duplicates of a single open file (i.e. they share a "file description"). If
+ * this is the case then later when char drivers are registered and stdin and
+ * stdout are set non-blocking, stderr will also become non-blocking. Because
+ * printf (and variants) are not "non-blocking aware" this can cause debugging
+ * information to be lost (typically when using "-d" without "-D" and producing
+ * a lot of debug output). To avoid this, we re-open stderr to create a
+ * separate "file description" with its own file status flags. */
+int fd = open("/proc/self/fd/2", O_WRONLY);
+if (fd != -1) {
+dup2(fd, 2);
+close(fd);
+}
+#endif
 setvbuf(stdout, NULL, _IOLBF, 0);
 }
 
diff --git a/os-win32.c b/os-win32.c
index cc09196..2fcbc72 100644
--- a/os-win32.c
+++ b/os-win32.c
@@ -88,7 +88,7 @@ char *os_find_datadir(void)
 return qemu_get_exec_dir();
 }
 

Re: [Qemu-devel] [PATCHv2 07/10] target-ppc: Split 44x tlbiva from ppc_tlb_invalidate_one()

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> Currently both the tlbiva instruction (used on 44x chips) and the
> tlbie
> instruction (used on hash MMU chips) are both handled via
> ppc_tlb_invalidate_one().  This is silly, because they're invoked
> from
> different places, and do different things.
> 
> Clean this up by separating out the tlbiva instruction into its own
> handling.  In fact the implementation is only a stub anyway.
> 
> Signed-off-by: David Gibson 

Acked-by: Benjamin Herrenschmidt 

> ---
>  target-ppc/helper.h |  1 +
>  target-ppc/mmu_helper.c | 14 ++
>  target-ppc/translate.c  |  2 +-
>  3 files changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/target-ppc/helper.h b/target-ppc/helper.h
> index 869be15..e5a8f7b 100644
> --- a/target-ppc/helper.h
> +++ b/target-ppc/helper.h
> @@ -544,6 +544,7 @@ DEF_HELPER_2(74xx_tlbd, void, env, tl)
>  DEF_HELPER_2(74xx_tlbi, void, env, tl)
>  DEF_HELPER_FLAGS_1(tlbia, TCG_CALL_NO_RWG, void, env)
>  DEF_HELPER_FLAGS_2(tlbie, TCG_CALL_NO_RWG, void, env, tl)
> +DEF_HELPER_FLAGS_2(tlbiva, TCG_CALL_NO_RWG, void, env, tl)
>  #if defined(TARGET_PPC64)
>  DEF_HELPER_FLAGS_3(store_slb, TCG_CALL_NO_RWG, void, env, tl, tl)
>  DEF_HELPER_2(load_slb_esid, tl, env, tl)
> diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
> index 82ebe5d..e9e0edb 100644
> --- a/target-ppc/mmu_helper.c
> +++ b/target-ppc/mmu_helper.c
> @@ -1971,10 +1971,6 @@ void ppc_tlb_invalidate_one(CPUPPCState *env,
> target_ulong addr)
>  ppc6xx_tlb_invalidate_virt(env, addr, 1);
>  }
>  break;
> -case POWERPC_MMU_BOOKE:
> -/* XXX: TODO */
> -cpu_abort(CPU(cpu), "BookE MMU model is not implemented\n");
> -break;
>  case POWERPC_MMU_32B:
>  case POWERPC_MMU_601:
>  /* tlbie invalidate TLBs for all segments */
> @@ -2116,6 +2112,16 @@ void helper_tlbie(CPUPPCState *env,
> target_ulong addr)
>  ppc_tlb_invalidate_one(env, addr);
>  }
>  
> +void helper_tlbiva(CPUPPCState *env, target_ulong addr)
> +{
> +PowerPCCPU *cpu = ppc_env_get_cpu(env);
> +
> +/* tlbiva instruciton only exists on BookE */
> +assert(env->mmu_model == POWERPC_MMU_BOOKE);
> +/* XXX: TODO */
> +cpu_abort(CPU(cpu), "BookE MMU model is not implemented\n");
> +}
> +
>  /* Software driven TLBs management */
>  /* PowerPC 602/603 software TLB load instructions helpers */
>  static void do_6xx_tlb(CPUPPCState *env, target_ulong new_EPN, int
> is_code)
> diff --git a/target-ppc/translate.c b/target-ppc/translate.c
> index 4be7eaa..a05a169 100644
> --- a/target-ppc/translate.c
> +++ b/target-ppc/translate.c
> @@ -5904,7 +5904,7 @@ static void gen_tlbiva(DisasContext *ctx)
>  }
>  t0 = tcg_temp_new();
>  gen_addr_reg_index(ctx, t0);
> -gen_helper_tlbie(cpu_env, cpu_gpr[rB(ctx->opcode)]);
> +gen_helper_tlbiva(cpu_env, cpu_gpr[rB(ctx->opcode)]);
>  tcg_temp_free(t0);
>  #endif
>  }



Re: [Qemu-devel] [PATCHv2 06/10] target-ppc: Remove unused mmu models from ppc_tlb_invalidate_one

2016-01-27 Thread Benjamin Herrenschmidt
On Wed, 2016-01-27 at 21:13 +1100, David Gibson wrote:
> ppc_tlb_invalidate_one() has a big switch handling many different MMU
> types.  However, most of those branches can never be reached:
> 
> It is called from 3 places: from remove_hpte() and h_protect() in
> spapr_hcall.c (which always has a 64-bit hash MMU type), and from
> helper_tlbie() in mmu_helper.c.
> 
> Calls to helper_tlbie() are generated from gen_tlbie, gen_tlbiel and
> gen_tlbiva.  The first two are only used with the PPC_MEM_TLBIE flag,
> set only with 32-bit or 64-bit hash MMU models, and gen_tlbiva() is
> used only on 440 and 460 models with the BookE mmu model.
> 
> This means the exhaustive list of MMU types which may call
> ppc_tlb_invalidate_one() is: POWERPC_MMU_SOFT_6xx, POWERPC_MMU_601,
> POWERPC_MMU_32B, POWERPC_MMU_SOFT_74xx, POWERPC_MMU_64B,
> POWERPC_MMU_2_03,
> POWERPC_MMU_2_06, POWERPC_MMU_2_07 and POWERPC_MMU_BOOKE.
> 
> Clean up by removing logic for all other MMU types from
> ppc_tlb_invalidate_one().

I would argue to move hash64 out of it as well anyway. First what we do
in there is dumb, but the way I change it with lazy inval differs and
tlbie does provide additional information on server processors that
we would need should we choose to implement fine-grained invalidations
(such as the page size).

In the meantime:

Acked-by: Benjamin Herrenschmidt 

> Signed-off-by: David Gibson 
> ---
>  target-ppc/mmu_helper.c | 20 ++--
>  1 file changed, 2 insertions(+), 18 deletions(-)
> 
> diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
> index c040b17..82ebe5d 100644
> --- a/target-ppc/mmu_helper.c
> +++ b/target-ppc/mmu_helper.c
> @@ -1971,25 +1971,10 @@ void ppc_tlb_invalidate_one(CPUPPCState *env,
> target_ulong addr)
>  ppc6xx_tlb_invalidate_virt(env, addr, 1);
>  }
>  break;
> -case POWERPC_MMU_SOFT_4xx:
> -case POWERPC_MMU_SOFT_4xx_Z:
> -ppc4xx_tlb_invalidate_virt(env, addr, env-
> >spr[SPR_40x_PID]);
> -break;
> -case POWERPC_MMU_REAL:
> -cpu_abort(CPU(cpu), "No TLB for PowerPC 4xx in real
> mode\n");
> -break;
> -case POWERPC_MMU_MPC8xx:
> -/* XXX: TODO */
> -cpu_abort(CPU(cpu), "MPC8xx MMU model is not
> implemented\n");
> -break;
>  case POWERPC_MMU_BOOKE:
>  /* XXX: TODO */
>  cpu_abort(CPU(cpu), "BookE MMU model is not implemented\n");
>  break;
> -case POWERPC_MMU_BOOKE206:
> -/* XXX: TODO */
> -cpu_abort(CPU(cpu), "BookE 2.06 MMU model is not
> implemented\n");
> -break;
>  case POWERPC_MMU_32B:
>  case POWERPC_MMU_601:
>  /* tlbie invalidate TLBs for all segments */
> @@ -2031,9 +2016,8 @@ void ppc_tlb_invalidate_one(CPUPPCState *env,
> target_ulong addr)
>  break;
>  #endif /* defined(TARGET_PPC64) */
>  default:
> -/* XXX: TODO */
> -cpu_abort(CPU(cpu), "Unknown MMU model\n");
> -break;
> +/* Should never reach here with other MMU models */
> +assert(0);
>  }
>  #else
>  ppc_tlb_invalidate_all(env);


Re: [Qemu-devel] [PATCHv2 07/10] target-ppc: Split 44x tlbiva from ppc_tlb_invalidate_one()

2016-01-27 Thread David Gibson
On Wed, Jan 27, 2016 at 06:58:43PM +0100, Laurent Vivier wrote:
> On 27/01/2016 11:13, David Gibson wrote:
> > Currently both the tlbiva instruction (used on 44x chips) and the tlbie
> > instruction (used on hash MMU chips) are both handled via
> > ppc_tlb_invalidate_one().  This is silly, because they're invoked from
> > different places, and do different things.
> > 
> > Clean this up by separating out the tlbiva instruction into its own
> > handling.  In fact the implementation is only a stub anyway.
> > 
> > Signed-off-by: David Gibson 
> > ---
> >  target-ppc/helper.h |  1 +
> >  target-ppc/mmu_helper.c | 14 ++
> >  target-ppc/translate.c  |  2 +-
> >  3 files changed, 12 insertions(+), 5 deletions(-)
> > 
> > diff --git a/target-ppc/helper.h b/target-ppc/helper.h
> > index 869be15..e5a8f7b 100644
> > --- a/target-ppc/helper.h
> > +++ b/target-ppc/helper.h
> > @@ -544,6 +544,7 @@ DEF_HELPER_2(74xx_tlbd, void, env, tl)
> >  DEF_HELPER_2(74xx_tlbi, void, env, tl)
> >  DEF_HELPER_FLAGS_1(tlbia, TCG_CALL_NO_RWG, void, env)
> >  DEF_HELPER_FLAGS_2(tlbie, TCG_CALL_NO_RWG, void, env, tl)
> > +DEF_HELPER_FLAGS_2(tlbiva, TCG_CALL_NO_RWG, void, env, tl)
> >  #if defined(TARGET_PPC64)
> >  DEF_HELPER_FLAGS_3(store_slb, TCG_CALL_NO_RWG, void, env, tl, tl)
> >  DEF_HELPER_2(load_slb_esid, tl, env, tl)
> > diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
> > index 82ebe5d..e9e0edb 100644
> > --- a/target-ppc/mmu_helper.c
> > +++ b/target-ppc/mmu_helper.c
> > @@ -1971,10 +1971,6 @@ void ppc_tlb_invalidate_one(CPUPPCState *env, 
> > target_ulong addr)
> >  ppc6xx_tlb_invalidate_virt(env, addr, 1);
> >  }
> >  break;
> > -case POWERPC_MMU_BOOKE:
> > -/* XXX: TODO */
> > -cpu_abort(CPU(cpu), "BookE MMU model is not implemented\n");
> > -break;
> >  case POWERPC_MMU_32B:
> >  case POWERPC_MMU_601:
> >  /* tlbie invalidate TLBs for all segments */
> > @@ -2116,6 +2112,16 @@ void helper_tlbie(CPUPPCState *env, target_ulong 
> > addr)
> >  ppc_tlb_invalidate_one(env, addr);
> >  }
> >  
> > +void helper_tlbiva(CPUPPCState *env, target_ulong addr)
> > +{
> > +PowerPCCPU *cpu = ppc_env_get_cpu(env);
> > +
> > +/* tlbiva instruciton only exists on BookE */
> 
> Typo here ^^

Corrected, thanks.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH v3] Add optionrom compatible with fw_cfg DMA version

2016-01-27 Thread Kevin O'Connor
On Mon, Jan 25, 2016 at 02:17:48PM +0100, Marc Marí wrote:
> This optionrom is based on linuxboot.S.

Hi Marc,

Out of curiosity, how does the timing with this option rom compare to
the previous SeaBIOS patches that implemented linux dma loading?


When I first tried to compile this (on fc23), I got:

In file included from /usr/include/features.h:389:0,
 from /usr/include/stdint.h:25,
 from /usr/lib/gcc/x86_64-redhat-linux/5.3.1/include/stdint.h:9,
 from linuxboot_dma.c:62:
/usr/include/gnu/stubs.h:7:27: fatal error: gnu/stubs-32.h: No such file or 
directory
compilation terminated.

which I fixed by running "dnf install glibc-devel.i686".  Is a
configure check needed?


See further comments below.

[...]
> --- /dev/null
> +++ b/pc-bios/optionrom/linuxboot_dma.c
> @@ -0,0 +1,262 @@
> +/*
> + * Linux Boot Option ROM for fw_cfg DMA
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> + *
> + * Copyright (c) 2015 Red Hat Inc.
> + *   Authors: Marc Marí 
> + */
> +
> +asm(
> +".text\n"
> +".global _start\n"
> +"_start:\n"
> +"   .short   0xaa55\n"
> +"   .byte (_end - _start) / 512\n"
> +"   lret\n"
> +"   .org 0x18\n"
> +"   .short 0\n"
> +"   .short _pnph\n"
> +"_pnph:\n"
> +"   .ascii \"$PnP\"\n"
> +"   .byte 0x01\n"
> +"   .byte ( _pnph_len / 16 )\n"
> +"   .short 0x\n"
> +"   .byte 0x00\n"
> +"   .byte 0x00\n"
> +"   .long 0x\n"
> +"   .short _manufacturer\n"
> +"   .short _product\n"
> +"   .long 0x\n"
> +"   .short 0x\n"
> +"   .short 0x\n"
> +"   .short _bev\n"
> +"   .short 0x\n"
> +"   .short 0x\n"
> +"   .equ _pnph_len, . - _pnph\n"
> +"   .align 4, 0\n"
> +"_bev:\n"
> +".code16gcc\n"
> +/* DS = CS */
> +"   movw %cs, %ax\n"
> +"   movw %ax, %ds\n"
> +"   movl %esp, %ebp\n"
> +"run_linuxboot:\n"
> +"   cli\n"
> +"   cld\n"
> +"   jmp load_kernel\n"
> +);

The run_linuxboot label doesn't seem to be used anywhere.

[...]
> +static inline uint16_t readw_addr32(const void *addr) {
> +uint16_t val;
> +asm("addr32 movw %1, %0" : "=r"(val) : "g"(addr));
> +barrier();
> +return val;
> +}
> +
> +static inline uint32_t readl_addr32(const void *addr) {
> +uint32_t val;
> +asm("addr32 movl %1, %0" : "=r"(val) : "g"(addr));
> +barrier();
> +return val;
> +}
> +
> +static inline void writel_addr32(void *addr, uint32_t val) {
> +barrier();
> +asm("addr32 movl %0, %1" : : "r"(val), "g"(addr));
> +}

The above does not look correct to me.  Since the code is running in
16bit mode the above memory accesses are relative to the %ds segment.
Because %ds=%cs this is going to access a different address than
expected.

What I think you want to do is assign %es=setup_addr>>4 and then
perform the read at the given offset (eg, 0x206).

[...]
> +static void bios_cfg_read_entry(void *buf, uint16_t entry, uint32_t len)
> +{
> +FWCfgDmaAccess access;
> +uint32_t control = (entry << 16) | BIOS_CFG_DMA_CTL_SELECT
> +| BIOS_CFG_DMA_CTL_READ;
> +
> +access.address = cpu_to_be64((uint64_t)(uint32_t)buf);
> +access.length = cpu_to_be32(len);
> +access.control = cpu_to_be32(control);
> +
> +barrier();
> +
> +outl(cpu_to_be32((uint32_t)&access), BIOS_CFG_DMA_ADDR_LOW);
> +
> +while(be32_to_cpu(access.control) & ~BIOS_CFG_DMA_CTL_ERROR) {
> +barrier();
> +}
> +}

FYI, I think with a small incremental patch (see below) one could
entirely replace the existing linuxboot.rom with your new code.

The one caveat is that this patch requires that kvm support "big real
mode" and I know there were quirks with that on some older Intel
chips.  However, I think the "insb" instruction would trap anyway, so
maybe it's not an issue.

-Kevin


--- a/pc-bios/optionrom/linuxboot_dma.c
+++ b/pc-bios/optionrom/linuxboot_dma.c
@@ -73,6 +73,8 @@ asm(
 #define BIOS_CFG_DMA_CTL_SKIP0x04
 #define BIOS_CFG_DMA_CTL_SELECT  0x08
 
+#define BIOS_CFG_CTL   0x510
+#define BIOS_CFG_DATA  0x511
 #define BIOS_CFG_DMA_ADDR_HIGH 0x514
 #define BIOS_CFG_DMA_ADDR_LOW  0x518
 
@@ -87,6 +89,16 @@ typedef struct FWCfgDmaAccess {
 uint64_t address;
 } __attribute__((packed)) FWCfgDmaAccess;
 
+static inline void outw(uint16_t value, uint16_t port) {
+asm("outw %w0, %w1" : : "a"(value), "Nd"(port));
+}
+
+static inline uint32_t 

Re: [Qemu-devel] [PATCH v14 7/8] Implement new driver for block replication

2016-01-27 Thread Wen Congyang
On 01/27/2016 10:46 PM, Stefan Hajnoczi wrote:
> On Wed, Jan 13, 2016 at 05:18:31PM +0800, Changlong Xie wrote:
>> From: Wen Congyang 
>>
>> Signed-off-by: Wen Congyang 
>> Signed-off-by: zhanghailiang 
>> Signed-off-by: Gonglei 
>> Signed-off-by: Changlong Xie 
>> ---
>>  block/Makefile.objs  |   1 +
>>  block/replication-comm.c |  66 +
>>  block/replication.c  | 590 
>> +++
>>  include/block/replication-comm.h |  50 
>>  qapi/block-core.json |  13 +
>>  5 files changed, 720 insertions(+)
>>  create mode 100644 block/replication-comm.c
>>  create mode 100644 block/replication.c
>>  create mode 100644 include/block/replication-comm.h
>>
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index fa05f37..7037662 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -23,6 +23,7 @@ block-obj-$(CONFIG_LIBSSH2) += ssh.o
>>  block-obj-y += accounting.o
>>  block-obj-y += write-threshold.o
>>  block-obj-y += backup.o
>> +block-obj-y += replication-comm.o replication.o
>>  
>>  common-obj-y += stream.o
>>  common-obj-y += commit.o
>> diff --git a/block/replication-comm.c b/block/replication-comm.c
>> new file mode 100644
>> index 000..8af748b
>> --- /dev/null
>> +++ b/block/replication-comm.c
>> @@ -0,0 +1,66 @@
>> +/*
>> + * Replication Block filter
> 
> Is the start/stop/checkpoint callback interface only useful for block
> replication?
> 
> This seems like a generic interface for registering with COLO.  Other
> components (networking, etc) might also need start/stop/checkpoint
> callbacks.  If that's the case then this code should be outside block/
> and the brs->bs field should either be void *opaque or removed (the
> caller needs to use container_of()).

Yes, we will do it in the next version.

> 
>> + *
>> + * Copyright (c) 2015 HUAWEI TECHNOLOGIES CO., LTD.
>> + * Copyright (c) 2015 Intel Corporation
>> + * Copyright (c) 2015 FUJITSU LIMITED
>> + *
>> + * Author:
>> + *   Wen Congyang 
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "block/replication-comm.h"
>> +
>> +static QLIST_HEAD(, BlockReplicationState) block_replication_states;
>> +
>> +BlockReplicationState *block_replication_new(BlockDriverState *bs,
>> + BlockReplicationOps *ops)
>> +{
>> +BlockReplicationState *brs;
>> +
>> +brs = g_new0(BlockReplicationState, 1);
>> +brs->bs = bs;
>> +brs->ops = ops;
>> +QLIST_INSERT_HEAD(&block_replication_states, brs, node);
>> +
>> +return brs;
>> +}
>> +
>> +void block_replication_remove(BlockReplicationState *brs)
>> +{
>> +QLIST_REMOVE(brs, node);
>> +g_free(brs);
>> +}
>> +
>> +void block_replication_start_all(ReplicationMode mode, Error **errp)
>> +{
>> +BlockReplicationState *brs, *next;
>> +QLIST_FOREACH_SAFE(brs, &block_replication_states, node, next) {
>> +if (brs->ops && brs->ops->start) {
>> +brs->ops->start(brs, mode, errp);
>> +}
>> +}
>> +}
>> +
>> +void block_replication_do_checkpoint_all(Error **errp)
>> +{
>> +BlockReplicationState *brs, *next;
>> +QLIST_FOREACH_SAFE(brs, &block_replication_states, node, next) {
>> +if (brs->ops && brs->ops->checkpoint) {
>> +brs->ops->checkpoint(brs, errp);
>> +}
>> +}
>> +}
>> +
>> +void block_replication_stop_all(bool failover, Error **errp)
>> +{
>> +BlockReplicationState *brs, *next;
>> +QLIST_FOREACH_SAFE(brs, &block_replication_states, node, next) {
>> +if (brs->ops && brs->ops->stop) {
>> +brs->ops->stop(brs, failover, errp);
>> +}
>> +}
>> +}
>> diff --git a/block/replication.c b/block/replication.c
>> new file mode 100644
>> index 000..29c677a
>> --- /dev/null
>> +++ b/block/replication.c
>> @@ -0,0 +1,590 @@
>> +/*
>> + * Replication Block filter
>> + *
>> + * Copyright (c) 2015 HUAWEI TECHNOLOGIES CO., LTD.
>> + * Copyright (c) 2015 Intel Corporation
>> + * Copyright (c) 2015 FUJITSU LIMITED
>> + *
>> + * Author:
>> + *   Wen Congyang 
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include "block/blockjob.h"
>> +#include "block/nbd.h"
>> +#include "block/replication-comm.h"
>> +
>> +typedef struct BDRVReplicationState {
>> +ReplicationMode mode;
>> +int replication_state;
>> +BlockDriverState *active_disk;
>> +BlockDriverState *hidden_disk;
>> +BlockDriverState *secondary_disk;
>> +BlockDriverState *top_bs;
>> +BlockReplicationState *brs;
>> +Error *blocker;
>> +int orig_hidden_flags;
>> +

Re: [Qemu-devel] virtio-scsi/blk dataplane and guest memory allocation

2016-01-27 Thread Fam Zheng
On Wed, 01/27 21:03, Roy Shterman wrote:
> Hi,
> 
> First of all thank very much for your help,
> 
> Second, unfortunately data-plane didn't worked well, I tried to add threads
> from the instructions you gave me.
> 
> Here is my full xml file, maybe you can help me to understand why it didn't
> worked :

Did you specify iothread in the xml?

> 
> 
>   gen-r-vrt-105-007-RH7.0x64
>   8f79e97e-d452-4577-82bd-2ed903773026
>   2097152
>   2097152
>   
> 8388608
>   
>   
> 
>   
>   2
>   
> hvm
> 
>   
>   
> 
> 
> 
>   
>   
>   destroy
>   restart
>   restart

Add

1

>   
> 
> /.autodirect/mtrswgwork/roysh/git/qemu/x86_64-softmmu/qemu-system-x86_64
> 
>   

and change these lines to:



Fam

>   
>   
>function='0x0'/>
> 
> 
>   
>   
> 
>   
>   
>function='0x0'/>
> 
> 
>   
>   
> 
>   
>   
>   
> 
> 
>   
>   
> 
>   
>   
>   
> 
> 
>function='0x0'/>
> 
> 
>function='0x0'/>
> 
> 
>function='0x2'/>
> 
> 
> 
>   
>   
>   
>function='0x0'/>
> 
> 
>   
> 
> 
>   
> 
> 
> 
> 
> 
>   
>function='0x0'/>
> 
> 
>function='0x0'/>
> 
>   
> 
> 
> BTW, in RH 7.2 data-plane is default if one is choosing to work with virtio?
> 
> Thank you very much,
> Roy
> 
> On Thu, Jan 21, 2016 at 11:01 AM, Paolo Bonzini  wrote:
> 
> >
> >
> > On 20/01/2016 21:12, Roy Shterman wrote:
> > > Hi,
> > >
> > > I have two questions,
> > >
> > > First, I'm developing for Libiscsi and trying to work with virtio-scsi
> > > dataplane or even virtio-blk dataplane and it doesn't works well.
> > >
> > > I'm working with latest qemu and latest Libiscsi in RedHat 7 libvirt
> > > package.
> > >
> > > my iscsi xml part is :
> > >
> > > virtio-blk -
> > >
> > > 
> > >   
> > >   
> > > 
> > >   
> > >   
> > >> > function='0x0'/>
> > > 
> > >
> > > virtio-scsi -
> > >
> > > 
> > >   
> > >   
> > >   
> > >   
> > >   
> > > 
> > > 
> > >> > function='0x0'/>
> > > 
> >
> > There is now support for dataplane in libvirt.  See
> > https://libvirt.org/formatdomain.html#elementsIOThreadsAllocation and
> > then you can add an iothread='NN' (NN is a number) to the <driver name='qemu' type='raw'/> element.
> >
> > > second thing, I'm trying to look for the code where QEMU allocate all
> > > guest memory (2 GB) in my case.
> >
> > Start at memory_allocate_system_memory; ultimately you'll reach
> > qemu_anon_ram_alloc which is basically an mmap.
> >
> > paolo
> >



Re: [Qemu-devel] [PATCH v9 30/37] qapi: Canonicalize missing object to :empty

2016-01-27 Thread Markus Armbruster
Markus Armbruster  writes:

> Markus Armbruster  writes:
>
>> Eric Blake  writes:
>>
>>> Now that we elide unnecessary visits of empty types, we can
>>> start using the special ':empty' type in more places.  By using
>>> the empty type as the base class of every explicit struct or
>>> union, and as the default data for any command or event, we can
>>> simplify later logic in qapi-{visit,commands,event} by merely
>>> checking whether the type is empty, without also having to worry
>>> whether a type was even supplied.

You rewrite a command's arg_type from None to ':empty', but not its
ret_type.  Deepens the asymmetry between the two.

>>> Note that gen_object() in qapi-types still has to check for a
>>> base, because it is also called for alternates (which have no
>>> base).
>>
>> What about the one in gen_visit_struct()?
>>
>> if (base and not base.is_empty()) or members:
>> ret += mcgen('''
>> visit_type_%(c_name)s_fields(v, obj, &err);
>> ''',
>>  c_name=c_name(name))
>>
>>> No change to generated code.
>>>
>>> Signed-off-by: Eric Blake 
>>>
>>> ---
>>> v9: squash in more related changes
>>> v8: rebase to earlier changes
>>> v7: rebase to earlier changes
>>> v6: new patch
>>> ---
>>>  scripts/qapi-commands.py| 17 +++--
>>>  scripts/qapi-event.py   |  5 ++--
>>>  scripts/qapi-types.py   |  4 +--
>>>  scripts/qapi-visit.py   | 12 +
>>>  scripts/qapi.py | 25 +-
>>>  tests/qapi-schema/event-case.out|  2 +-
>>>  tests/qapi-schema/flat-union-empty.out  |  1 +
>>>  tests/qapi-schema/ident-with-escape.out |  1 +
>>>  tests/qapi-schema/indented-expr.out |  4 +--
>>>  tests/qapi-schema/qapi-schema-test.out  | 45 
>>> ++---
>>>  tests/qapi-schema/union-clash-data.out  |  2 ++
>>>  tests/qapi-schema/union-empty.out   |  1 +
>>>  12 files changed, 83 insertions(+), 36 deletions(-)
>
> Missing: update to qapi-introspect.py.  At least the expressions like
>
> arg_type or self._schema.the_empty_object_type
>
> need updating.

But so far not the ret_type or self._schema.the_empty_object_type.

[...]



Re: [Qemu-devel] [PATCH 00/37] clean include files to use osdep.h

2016-01-27 Thread Paolo Bonzini


On 26/01/2016 19:16, Peter Maydell wrote:
> This is a big fat patchset that touches nearly 800 files,
> but it's all automated use of scripts/clean-includes.
> This doesn't cover the whole tree, but it does get all
> of target-* and hw/ and some other bits.
> I split the patches up mostly roughly by entries in
> MAINTAINERS, and also with some more ad-hoc splitting
> at the end of the series. The final larger patch is
> the "everything else in hw" bit -- I stopped when I
> got down to a diffstat that wasn't too stupidly huge and
> there weren't any more obvious neat carve-outs.
> 
> I propose to apply this directly to master in the not too
> distant future, since the consensus appears to be that that's
> overall less painful than trying to split it all between
> maintainer trees for a purely mechanical change.
> 
> There will then be another similar series which picks up
> the remaining stuff.

Agreed, thanks for doing this!

Paolo



Re: [Qemu-devel] [RFC 0/10] Support Receive-Segment-Offload(RSC) for WHQL test of Window guest

2016-01-27 Thread Wei Xu



On 01/27/2016 12:52 PM, Jason Wang wrote:


On 01/26/2016 02:44 PM, Fam Zheng wrote:

On Tue, 01/26 06:24, w...@redhat.com wrote:

Wei Xu (10):
   'Segment', 'Chain' and 'Status' enumeration.
   Initilize & Cleanup.
   Chain lookup and packets caching.
   Tcp general data coalescing
   The draining timer
   IPv4 checksum.
   TCP control packet handling
   Sanity check & More bypass cases check.
   IPv6 support.
   Statistics.

Please add subsystem prefixes to subjects, like:

   "virtio-net: IPv6 support"
   "virtio-net: Statistics

And need to be more verbose. E.g:

- "Statistics" is too generic, something like "TCP coalescing
statistics" is much better.
- "virtio-net: IPv6 support" which is really confusing since it lacks
some context. Reviewers may suspect there's no ipv6 support in the past.
- "Tcp general data coalescing, the parameters is a little bit horrible,
it's complicated to read, should can be optimized later." is too long to
be a subject. "Tcp general data coalescing" should be ok. For personal
comment like "the parameters is a little bit horrible, it's complicated
to read, should can be optimized later." could be places below '---' in
the patch.

And need a more verbose commit log please. At least I could not figure
out what is happening just from most of the commit logs. [1] is a very
good documentation for how to describe your changes, please have a look
at that and describe the changes correctly in each commit log. You can
also have a look at git history to see how it was done.

[1]
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/SubmittingPatches#n106

OK, Thanks a lot.

Wei


Thanks

This applies to the cover letter too.

(nit-pick: period "." is not necessary)

Fam








Re: [Qemu-devel] [PATCH v5] qom, qmp, hmp, qapi: create qom-type-prop-list for class properties

2016-01-27 Thread Daniel P. Berrange
On Wed, Jan 27, 2016 at 01:09:37PM -0200, Eduardo Habkost wrote:
> On Tue, Jan 26, 2016 at 10:19:13PM +, Daniel P. Berrange wrote:
> > On Tue, Jan 26, 2016 at 03:26:35PM -0200, Eduardo Habkost wrote:
> > > On Tue, Jan 26, 2016 at 03:51:21PM +, Daniel P. Berrange wrote:
> > > > On Tue, Jan 26, 2016 at 01:35:38PM -0200, Eduardo Habkost wrote:
> > > > > On Mon, Jan 25, 2016 at 11:24:47AM +0300, Valentin Rakush wrote:
> > > > > > This patch adds support for qom-type-prop-list command to list 
> > > > > > object
> > > > > > class properties. A later patch will use this functionality to
> > > > > > implement x86_64-cpu properties.
> > > > > > 
> > > > > > Signed-off-by: Valentin Rakush 
> > > > > > Cc: Luiz Capitulino 
> > > > > > Cc: Eric Blake 
> > > > > > Cc: Markus Armbruster 
> > > > > > Cc: Andreas Färber 
> > > > > > Cc: Daniel P. Berrange 
> > > > > > Cc: Eduardo Habkost 
> > > > > > ---
> > > > > [...]
> > > > > > diff --git a/qmp.c b/qmp.c
> > > > > > index 53affe2..baf25c0 100644
> > > > > > --- a/qmp.c
> > > > > > +++ b/qmp.c
> > > > > > @@ -460,6 +460,37 @@ ObjectTypeInfoList *qmp_qom_list_types(bool 
> > > > > > has_implements,
> > > > > >  return ret;
> > > > > >  }
> > > > > >  
> > > > > > +ObjectPropertyInfoList *qmp_qom_type_prop_list(const char 
> > > > > > *typename, Error **errp)
> > > > > > +{
> > > > > > +ObjectClass *klass;
> > > > > > +ObjectPropertyInfoList *props = NULL;
> > > > > > +ObjectProperty *prop;
> > > > > > +ObjectPropertyIterator iter;
> > > > > > +
> > > > > > +klass = object_class_by_name(typename);
> > > > > > +if (!klass) {
> > > > > > +error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
> > > > > > +  "Object class '%s' not found", typename);
> > > > > > +return NULL;
> > > > > > +}
> > > > > > +
> > > > > > +object_class_property_iter_init(&iter, klass);
> > > > > > +while ((prop = object_property_iter_next(&iter))) {
> > > > > > +ObjectPropertyInfoList *entry = 
> > > > > > g_new0(ObjectPropertyInfoList, 1);
> > > > > > +
> > > > > > +if (entry) {
> > > > > > +entry->value = g_new0(ObjectPropertyInfo, 1);
> > > > > > +entry->next = props;
> > > > > > +props = entry;
> > > > > > +
> > > > > > +entry->value->name = g_strdup(prop->name);
> > > > > > +entry->value->type = g_strdup(prop->type);
> > > > > > +}
> > > > > > +}
> > > > > > +
> > > > > > +return props;
> > > > > > +}
> > > > > > +
> > > > > 
> > > > > We already have "-device ,help", and it uses a completely
> > > > > different mechanism for listing properties. There's no reason for
> > > > > having two arbitrarily different APIs for listing properties
> > > > > returning different results.
> > > > > 
> > > > > If qmp_device_list_properties() is not enough for you, please
> > > > > clarify why, so we can consider improving it.
> > > > 
> > > > qmp_device_list_properties() has to actually instantiate an instance
> > > > of objects it is reporting properties against, since it is reporting
> > > > properties registered against object instances. In fact it only
> > > > reports properties against things which are TYPE_DEVICE - it'll refuse
> > > > to report other object types. Having to instantiate objects is 
> > > > inherently
> > > > limiting to the command because there are some objects that cannot be
> > > > instantiated for this purpose. eg abstract objects and objects marked
> > > > "cannot_destroy_with_object_finalize_yet". Finally there is also a
> > > > performance and memory overhead in having to instantiate objects which
> > > > is best avoided.
> > > > 
> > > > This new API is reporting properties that are statically registered
> > > > against the *class* rather than the object instance. It is guaranteed
> > > > that you can always report these properties for any class without any
> > > > restrictions, nor any chance of side effects during instantiation.
> > > 
> > > The existing implementation has its limitations, but we can
> > > address those limitations without exporting a new API that return
> > > arbitrarily different results (that aren't even a superset of the
> > > existing API).
> > > 
> > > About the existing qmp_device_list_properties() limitations:
> > > 
> > > cannot_destroy_with_object_finalize_yet is supposed to eventually
> > > go away. If there are use cases that depend on listing properties
> > > for cannot_destroy_with_object_finalize_yet classes, we can fix
> > > that.
> > > 
> > > The TYPE_DEVICE requirement can be removed, as long as the
> > > non-device QOM classes are object_new()-safe like the existing
> > > cannot_destroy_with_object_finalize_yet=false device classes
> > > (they are supposed to be).
> > > 
> > > About having to instantiate objects: if optimizing that is so
> 

Re: [Qemu-devel] [PATCH v3] Add optionrom compatible with fw_cfg DMA version

2016-01-27 Thread Stefan Hajnoczi
On Tue, Jan 26, 2016 at 12:26:12PM +0100, Gerd Hoffmann wrote:
> On Di, 2016-01-26 at 12:20 +0100, Marc Marí wrote:
> > On Tue, 26 Jan 2016 11:11:54 +
> > Stefan Hajnoczi  wrote:
> > 
> > > On Mon, Jan 25, 2016 at 02:17:48PM +0100, Marc Marí wrote:
> > > > +linuxboot_dma.img: linuxboot_dma.o
> > > > +   $(call quiet-command,$(LD) $(LDFLAGS_NOPIE) -m elf_i386
> > > > -Ttext 0 -e _start -s -o $@ $<,"  Building $(TARGET_DIR)$@") +
> > > >  %.img: %.o
> > > > $(call quiet-command,$(LD) $(LDFLAGS_NOPIE) -Ttext 0 -e
> > > > _start -s -o $@ $<,"  Building $(TARGET_DIR)$@")  
> > > 
> > > Why is -m elf_i386 necessary for linuxboot_dma.img but not for the
> > > other *.img files?
> > 
> > I cannot give a precise explanation. But if I don't force an output
> > type, I get this error:
> > 
> > Building optionrom/linuxboot_dma.img
> > ld: i386 architecture of input file `linuxboot_dma.o' is incompatible
> > with i386:x86-64 output
> 
> Any chance the linker needs -m32 too?

I wonder why this isn't a problem for the existing firmware code.  Are
we really building x86_64 ELF files for our firmware?

Stefan


signature.asc
Description: PGP signature


[Qemu-devel] [Bug 1538541] Re: qcow2 rejects request to use preallocation with backing file

2016-01-27 Thread Max Reitz
Using any preallocation value other than none will result in all data
clusters of the new image being used. That means that any I/O request
will be served by that image, and never by the backing file. This is why
preallocating an image with a backing file is not supported, because it
generally doesn't make any sense. The backing file will never be seen
anyway.

In order to support this, qcow2 will need to support preallocated data
clusters which are explicitly marked as empty (where "empty" is not
"zero"; "empty" means "fall through to the backing file"). This has been
proposed before, but has not been implemented so far.

By the way, this is the very reason why explicitly forbidding the
combination of backing file and preallocation is very reasonable: Right
now, the backing file would be invisible, a preallocated image always
returns zeros when read. With the above feature implemented, the backing
file would be visible. In order to allow this change in behavior, we
have to make the combination an error for now.

Max

PS: The reason I write this is so that you know that this is not a bug,
but correct behavior in view of a missing feature (that should indeed be
implemented).

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1538541

Title:
  qcow2 rejects request to use preallocation with backing file

Status in QEMU:
  New

Bug description:
  The 'preallocation=full' option to qemu-img / qcow2 block driver
  instructs QEMU to fully allocate the host file to the maximum size
  needed by the logical disk size.

  $ qemu-img create -f qcow2 -o preallocation=full base.qcow2 200M
  Formatting 'base.qcow2', fmt=qcow2 size=209715200 encryption=off 
cluster_size=65536 preallocation='full' lazy_refcounts=off refcount_bits=16

  $ ls -alhs base.qcow2 
  201M -rw-r--r--. 1 berrange berrange 201M Jan 27 12:49 base.qcow2

  
  When specifying a backing file for the qcow2 file, however, it rejects the 
preallocation request

  $ qemu-img create -f qcow2 -o preallocation=full,backing_file=base.qcow2 
front.qcow2 200M
  Formatting 'front.qcow2', fmt=qcow2 size=209715200 backing_file='base.qcow2' 
encryption=off cluster_size=65536 preallocation='full' lazy_refcounts=off 
refcount_bits=16
  qemu-img: front.qcow2: Backing file and preallocation cannot be used at the 
same time

  
  It might seem like requesting full preallocation is redundant because most 
data associated with the image will be present in the backing file, and so the 
top layer is unlikely to ever need the full preallocation.  Rejecting this, 
however, means it is not (officially) possible to reserve disk space for the 
top layer to guarantee that future copy-on-writes will never get ENOSPC.

  OpenStack in particular uses backing files with all images, in order
  to avoid the I/O overhead of copying the backing file contents into
  the per-VM disk image. It, however, still wants to have a guarantee
  that the per-VM image will never hit an ENOSPC scenario.

  Currently it has to hack around QEMU's refusal to allow backing_file +
  preallocation, by calling 'fallocate' on the qcow2 file after it has
  been created. This is an inexact fix though, because it doesn't take
  account of the fact that qcow2 metadata can take some MBs of space.

  Thus, it would like to see preallocation=full supported in combination
  with backing files.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1538541/+subscriptions



[Qemu-devel] [PATCH v3 00/10] Allow hotplug of s390 CPUs

2016-01-27 Thread Matthew Rosato
Changes from v2->v3:

* Call cpu_remove_sync rather than cpu_remove().
* Pull latest version of patches from pseries set (v6).  Trivial change to 
  "Reclaim VCPU objects" to fix checkpatch error.
* Add object_unparent during s390_cpu_release to accommodate changes in 
  Patch 4 "Reclaim VCPU objects."
* Remove a cleanup patch in favor of 2 patches from pseries set.

**

The following patchset enables hotplug of s390 CPUs.

The standard interface is used -- to configure a guest with 2 CPUs online at 
boot and 4 maximum:

qemu -smp 2,maxcpus=4

To subsequently hotplug a CPU:

Issue 'device_add s390-cpu,id=' from monitor.

At this point, the guest must bring the CPU online for use -- This can be 
achieved via "echo 1 > /sys/devices/system/cpu/cpuX/online" or via a management 
tool like cpuplugd.

Hot unplug support is provided via 'device_del ', however s390 does not have
a mechanism for gracefully handling a CPU that has been removed, so this event
triggers a reset of the guest in order to force recognition.  

This patch set is based on work previously done by Jason Herne.

Bharata B Rao (3):
  exec: Remove cpu from cpus list during cpu_exec_exit()
  exec: Do vmstate unregistration from cpu_exec_exit()
  cpu: Add a sync version of cpu_remove()

Gu Zheng (1):
  cpu: Reclaim vCPU objects

Matthew Rosato (6):
  s390x/cpu: Cleanup init in preparation for hotplug
  s390x/cpu: Set initial CPU state in common routine
  s390x/cpu: Move some CPU initialization into realize
  s390x/cpu: Add functions to (un)register CPU state
  s390/virtio-ccw: Add hotplug handler and prepare for unplug
  s390x/cpu: Allow hot plug/unplug of CPUs

 cpus.c | 50 +
 exec.c | 30 
 hw/s390x/s390-virtio-ccw.c | 30 +++-
 hw/s390x/s390-virtio.c | 64 +++---
 hw/s390x/s390-virtio.h |  2 +-
 include/qom/cpu.h  | 18 
 include/sysemu/kvm.h   |  1 +
 kvm-all.c  | 57 -
 kvm-stub.c |  5 
 target-s390x/cpu.c | 70 +++---
 target-s390x/cpu.h |  4 +++
 11 files changed, 308 insertions(+), 23 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v8] spec: add qcow2 bitmaps extension specification

2016-01-27 Thread Vladimir Sementsov-Ogievskiy
The new feature for qcow2: storing bitmaps.

This patch adds new header extension to qcow2 - Bitmaps Extension. It
provides an ability to store virtual disk related bitmaps in a qcow2
image. For now there is only one type of such bitmaps: Dirty Tracking
Bitmap, which just tracks virtual disk changes from some moment.

Note: Only bitmaps, relative to the virtual disk, stored in qcow2 file,
should be stored in this qcow2 file. The size of each bitmap
(considering its granularity) is equal to virtual disk size.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

v8
- rewordings
- bitmap_directory_size: 4b -> 8b
- add more descriptive description in == Bitmaps == section
- add paragraph "Dirty tracking bitmaps"

  Bitmap directory entry:
- extra data should not allocate additional clusters
- padding must be all-bytes-zero
- add extra_data_compatible flag (now behavior in case of unknown
  extra data is defined by this flag)

v7:

- Rewordings, grammar.
  Max, Eric, John, thank you very much.

- add last paragraph: remaining bits in bitmap data clusters must be
  zero.

- s/Bitmap Directory/bitmap directory/ and other names like this at
  the request of Max.

v6:

- reword bitmap_directory_size description
- bitmap type: make 0 reserved
- extra_data_size: resize to 4bytes
  Also, I've marked this field as "must be zero". We can always change
  it, if we decide allowing managing app to specify any extra data, by
  defining some magic value as a top of user extra data.. So, for now
  non-zero extra_data_size should be considered as an error.
- swap name and extra_data to give good alignment to extra_data.


v5:

- 'Dirty bitmaps' renamed to 'Bitmaps', as we may have several types of
  bitmaps.
- rewordings
- move upper bounds to "Notes about Qemu limits"
- s/should/must somewhere. (but not everywhere)
- move name_size field closer to name itself in bitmap header
- add extra data area to bitmap header
- move bitmap data description to separate section

 docs/specs/qcow2.txt | 225 ++-
 1 file changed, 224 insertions(+), 1 deletion(-)

diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
index f236d8c..7b0ebef 100644
--- a/docs/specs/qcow2.txt
+++ b/docs/specs/qcow2.txt
@@ -103,7 +103,18 @@ in the description of a field.
 write to an image with unknown auto-clear features if it
 clears the respective bits from this field first.
 
-Bits 0-63:  Reserved (set to 0)
+Bit 0:  Bitmaps extension bit
+This bit indicates consistency for the bitmaps
+extension data.
+
+It is an error if this bit is set without the
+bitmaps extension present.
+
+If the bitmaps extension is present but this
+bit is unset, the bitmaps extension data must 
be
+considered inconsistent.
+
+Bits 1-63:  Reserved (set to 0)
 
  96 -  99:  refcount_order
 Describes the width of a reference count block entry (width
@@ -123,6 +134,7 @@ be stored. Each extension has a structure like the 
following:
 0x - End of the header extension area
 0xE2792ACA - Backing file format name
 0x6803f857 - Feature name table
+0x23852875 - Bitmaps extension
 other  - Unknown header extension, can be safely
  ignored
 
@@ -166,6 +178,36 @@ the header extension data. Each entry look like this:
 terminated if it has full length)
 
 
+== Bitmaps extension ==
+
+The bitmaps extension is an optional header extension. It provides the ability
+to store bitmaps related to a virtual disk. For now, there is only one bitmap
+type: the dirty tracking bitmap, which tracks virtual disk changes from some
+point in time.
+
+The data of the extension should be considered consistent only if the
+corresponding auto-clear feature bit is set, see autoclear_features above.
+
+The fields of the bitmaps extension are:
+
+Byte  0 -  3:  nb_bitmaps
+   The number of bitmaps contained in the image. Must be
+   greater than or equal to 1.
+
+   Note: Qemu currently only supports up to 65535 bitmaps per
+   image.
+
+  4 -  7:  Reserved, must be zero.
+
+  8 - 15:  bitmap_directory_size
+   Size of the bitmap directory in bytes. It is the cumulative
+   size of all (nb_bitmaps) bitmap headers.
+
+ 16 - 23:  bitmap_directory_offset
+   Offset into the image file at which the bitmap directory
+   starts. Must 

Re: [Qemu-devel] VFIO based vGPU(was Re: [Announcement] 2015-Q3 release of XenGT - a Mediated ...)

2016-01-27 Thread Alex Williamson
On Wed, 2016-01-27 at 13:36 +0530, Kirti Wankhede wrote:
> 
> On 1/27/2016 1:36 AM, Alex Williamson wrote:
> > On Tue, 2016-01-26 at 02:20 -0800, Neo Jia wrote:
> > > On Mon, Jan 25, 2016 at 09:45:14PM +, Tian, Kevin wrote:
> > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > >   
> > > Hi Alex, Kevin and Jike,
> > >   
> > > (Seems I shouldn't use attachment, resend it again to the list, patches 
> > > are
> > > inline at the end)
> > >   
> > > Thanks for adding me to this technical discussion, a great opportunity
> > > for us to design together which can bring both Intel and NVIDIA vGPU 
> > > solution to
> > > KVM platform.
> > >   
> > > Instead of directly jumping to the proposal that we have been working on
> > > recently for NVIDIA vGPU on KVM, I think it is better for me to put out 
> > > couple
> > > quick comments / thoughts regarding the existing discussions on this 
> > > thread as
> > > fundamentally I think we are solving the same problem, DMA, interrupt and 
> > > MMIO.
> > >   
> > > Then we can look at what we have, hopefully we can reach some consensus 
> > > soon.
> > >   
> > > > Yes, and since you're creating and destroying the vgpu here, this is
> > > > where I'd expect a struct device to be created and added to an IOMMU
> > > > group.  The lifecycle management should really include links between
> > > > the vGPU and physical GPU, which would be much, much easier to do with
> > > > struct devices create here rather than at the point where we start
> > > > doing vfio "stuff".
> > >   
> > > In fact, to keep vfio-vgpu to be more generic, vgpu device creation and 
> > > management
> > > can be centralized and done in vfio-vgpu. That also include adding to 
> > > IOMMU
> > > group and VFIO group.
> > Is this really a good idea?  The concept of a vgpu is not unique to
> > vfio, we want vfio to be a driver for a vgpu, not an integral part of
> > the lifecycle of a vgpu.  That certainly doesn't exclude adding
> > infrastructure to make lifecycle management of a vgpu more consistent
> > between drivers, but it should be done independently of vfio.  I'll go
> > back to the SR-IOV model, vfio is often used with SR-IOV VFs, but vfio
> > does not create the VF, that's done in coordination with the PF making
> > use of some PCI infrastructure for consistency between drivers.
> > 
> > It seems like we need to take more advantage of the class and driver
> > core support to perhaps setup a vgpu bus and class with vfio-vgpu just
> > being a driver for those devices.
> 
> For device passthrough or SR-IOV model, PCI devices are created by PCI 
> bus driver and from the probe routine each device is added in vfio group.

An SR-IOV VF is created by the PF driver using standard interfaces
provided by the PCI core.  The IOMMU group for a VF is added by the
IOMMU driver when the device is created on the pci_bus_type.  The probe
routine of the vfio bus driver (vfio-pci) is what adds the device into
the vfio group.

> For vgpu, there should be a common module that create vgpu device, say 
> vgpu module, add vgpu device to an IOMMU group and then add it to vfio 
> group.  This module can handle management of vgpus. Advantage of keeping 
> this module a separate module than doing device creation in vendor 
> modules is to have generic interface for vgpu management, for example, 
> files /sys/class/vgpu/vgpu_start and  /sys/class/vgpu/vgpu_shudown and 
> vgpu driver registration interface.

But you're suggesting something very different from the SR-IOV model.
If we wanted to mimic that model, the GPU specific driver should create
the vgpu using services provided by a common interface.  For instance
i915 could call a new vgpu_device_create() which creates the device,
adds it to the vgpu class, etc.  That vgpu device should not be assumed
to be used with vfio though, that should happen via a separate probe
using a vfio-vgpu driver.  It's that vfio bus driver that will add the
device to a vfio group.

> In the patch, vgpu_dev.c + vgpu_sysfs.c form such vgpu module and 
> vgpu_vfio.c is for VFIO interface. Each vgpu device should be added to 
> vfio group, so vgpu_group_init() from vgpu_vfio.c should be called per 
> device. In the vgpu module, vgpu devices are created on request, so 
> vgpu_group_init() should be called explicitly for per vgpu device. 
>   That’s why had merged the 2 modules, vgpu + vgpu_vfio to form one vgpu 
> module.  Vgpu_vfio would remain separate entity but merged with vgpu 
> module.

I disagree with this design, creation of a vgpu necessarily involves the
GPU driver and should not be tied to use of the vgpu with vfio.  vfio
should be a driver for the device, maybe eventually not the only driver
for the device.  Thanks,

Alex




Re: [Qemu-devel] [RFC PATCH 04/16] block: Move filename_decompose to block.c

2016-01-27 Thread Eric Blake
On 01/26/2016 03:38 AM, Fam Zheng wrote:
> With the return value decoupled from VMDK, it can be reused by other block
> code.
> 
> Signed-off-by: Fam Zheng 
> ---
>  block.c   | 40 
>  block/vmdk.c  | 40 
>  include/block/block.h |  2 ++
>  3 files changed, 42 insertions(+), 40 deletions(-)
> 

> +++ b/block.c
> @@ -144,6 +144,46 @@ int path_is_absolute(const char *path)
>  #endif
>  }
>  
> +int filename_decompose(const char *filename, char *path, char *prefix,
> +   char *postfix, size_t buf_len, Error **errp)
> +{
> +const char *p, *q;
> +
> +if (filename == NULL || !strlen(filename)) {
> +error_setg(errp, "No filename provided");
> +return -EINVAL;
> +}
> +p = strrchr(filename, '/');
> +if (p == NULL) {
> +p = strrchr(filename, '\\');
> +}

I know this is just code motion, but it feels like it does the wrong
thing on Unix boxes (trying too hard to appease Windows boxes).  Is that
something that needs to be independently addressed?

But as for this patch, the code motion is fine.
Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [PATCH v3 01/10] exec: Remove cpu from cpus list during cpu_exec_exit()

2016-01-27 Thread Matthew Rosato
From: Bharata B Rao 

CPUState *cpu gets added to the cpus list during cpu_exec_init(). It
should be removed from cpu_exec_exit().

cpu_exec_exit() is called from generic CPU::instance_finalize and some
archs like PowerPC call it from CPU unrealizefn. So ensure that we
dequeue the cpu only once.

Now -1 value for cpu->cpu_index indicates that we have already dequeued
the cpu for CONFIG_USER_ONLY case also.

Signed-off-by: Bharata B Rao 
Reviewed-by: David Gibson 
---
 exec.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/exec.c b/exec.c
index 7115403..c8da9d4 100644
--- a/exec.c
+++ b/exec.c
@@ -596,6 +596,7 @@ void cpu_exec_exit(CPUState *cpu)
 return;
 }
 
+QTAILQ_REMOVE(&cpus, cpu, node);
 bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
 cpu->cpu_index = -1;
 }
@@ -614,6 +615,15 @@ static int cpu_get_free_index(Error **errp)
 
 void cpu_exec_exit(CPUState *cpu)
 {
+cpu_list_lock();
+if (cpu->cpu_index == -1) {
+cpu_list_unlock();
+return;
+}
+
+QTAILQ_REMOVE(&cpus, cpu, node);
+cpu->cpu_index = -1;
+cpu_list_unlock();
 }
 #endif
 
-- 
1.9.1




Re: [Qemu-devel] [PATCH v2] virtio-pci: call pci reset variant when guest clears status.

2016-01-27 Thread Michael S. Tsirkin
On Wed, Jan 27, 2016 at 03:09:58PM +0100, Gerd Hoffmann wrote:
> Actually fixes linux not finding virtio 1.0 device virtqueues after
> reboot.  Which is new I think, any chance linux kernel virtio code
> became more strict in 4.3?
> 
> Signed-off-by: Gerd Hoffmann 
> ---
>  hw/virtio/virtio-pci.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index 94667e6..8213d94 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -47,6 +47,7 @@
>  
>  static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
> VirtIOPCIProxy *dev);
> +static void virtio_pci_reset(DeviceState *qdev);
>  
>  /* virtio device */
>  /* DeviceState to VirtIOPCIProxy. For use off data-path. TODO: use QOM. */
> @@ -432,8 +433,7 @@ static void virtio_ioport_write(void *opaque, uint32_t 
> addr, uint32_t val)
>  }
>  
>  if (vdev->status == 0) {
> -virtio_reset(vdev);
> -msix_unuse_all_vectors(&proxy->pci_dev);
> +virtio_pci_reset(DEVICE(proxy));
>  }
>  
>  /* Linux before 2.6.34 drives the device without enabling

Aren't there two call sites in virtio_ioport_write?

> @@ -1351,8 +1351,7 @@ static void virtio_pci_common_write(void *opaque, 
> hwaddr addr,
>  }
>  
>  if (vdev->status == 0) {
> -virtio_reset(vdev);
> -msix_unuse_all_vectors(&proxy->pci_dev);
> +virtio_pci_reset(DEVICE(proxy));
>  }
>  
>  break;
> -- 
> 1.8.3.1



Re: [Qemu-devel] [RFC PATCH 05/16] block: Make bdrv_get_cluster_size public

2016-01-27 Thread Eric Blake
On 01/26/2016 03:38 AM, Fam Zheng wrote:
> Signed-off-by: Fam Zheng 
> ---
>  block/io.c| 2 +-
>  include/block/block.h | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)

Reviewed-by: Eric Blake 

> 
> diff --git a/block/io.c b/block/io.c
> index b964e7e..15e461f 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -425,7 +425,7 @@ void bdrv_round_to_clusters(BlockDriverState *bs,
>  }
>  }
>  
> -static int bdrv_get_cluster_size(BlockDriverState *bs)
> +int bdrv_get_cluster_size(BlockDriverState *bs)
>  {

Worth adding a doc comment while touching it?

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH v6 05/11] cpu: Reclaim vCPU objects

2016-01-27 Thread Matthew Rosato
On 01/08/2016 01:55 AM, Bharata B Rao wrote:
> From: Gu Zheng 
> 
> In order to deal well with the kvm vcpus (which can not be removed without any
> protection), we do not close KVM vcpu fd, just record and mark it as stopped
> into a list, so that we can reuse it for the appending cpu hot-add request if
> possible. It is also the approach that kvm guys suggested:
> https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> 
> Signed-off-by: Chen Fan 
> Signed-off-by: Gu Zheng 
> Signed-off-by: Zhu Guihua 
> Signed-off-by: Bharata B Rao 
>[- Explicit CPU_REMOVE() from qemu_kvm/tcg_destroy_vcpu()
>   isn't needed as it is done from cpu_exec_exit()
> - Use iothread mutex instead of global mutex during
>   destroy
> - Don't cleanup vCPU object from vCPU thread context
>   but leave it to the callers (device_add/device_del)]
> ---
>  cpus.c   | 38 +++
>  include/qom/cpu.h| 10 +
>  include/sysemu/kvm.h |  1 +
>  kvm-all.c| 57 
> +++-
>  kvm-stub.c   |  5 +
>  5 files changed, 110 insertions(+), 1 deletion(-)
> 
> diff --git a/cpus.c b/cpus.c
> index ea29584..12374af 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -953,6 +953,18 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void 
> *data), void *data)
>  qemu_cpu_kick(cpu);
>  }
> 
> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> +{
> +if (kvm_destroy_vcpu(cpu) < 0) {
> +error_report("kvm_destroy_vcpu failed.\n");

FYI, checkpatch fails here -- no need for the newline.

Matt




Re: [Qemu-devel] VFIO based vGPU(was Re: [Announcement] 2015-Q3 release of XenGT - a Mediated ...)

2016-01-27 Thread Alex Williamson
On Wed, 2016-01-27 at 01:14 -0800, Neo Jia wrote:
> On Tue, Jan 26, 2016 at 04:30:38PM -0700, Alex Williamson wrote:
> > On Tue, 2016-01-26 at 14:28 -0800, Neo Jia wrote:
> > > On Tue, Jan 26, 2016 at 01:06:13PM -0700, Alex Williamson wrote:
> > > > > 1.1 Under per-physical device sysfs:
> > > > > --
> > > > >  
> > > > > vgpu_supported_types - RO, list the current supported virtual GPU 
> > > > > types and its
> > > > > VGPU_ID. VGPU_ID - a vGPU type identifier returned from reads of
> > > > > "vgpu_supported_types".
> > > > > 
> > > > > vgpu_create - WO, input syntax , create a virtual
> > > > > gpu device on a target physical GPU. idx: virtual device index inside 
> > > > > a VM
> > > > >  
> > > > > vgpu_destroy - WO, input syntax , destroy a virtual gpu 
> > > > > device on a
> > > > > target physical GPU
> > > >  
> > > >  
> > > > I've noted in previous discussions that we need to separate user policy
> > > > from kernel policy here, the kernel policy should not require a "VM
> > > > UUID".  A UUID simply represents a set of one or more devices and an
> > > > index picks the device within the set.  Whether that UUID matches a VM
> > > > or is independently used is up to the user policy when creating the
> > > > device.
> > > >  
> > > > Personally I'd also prefer to get rid of the concept of indexes within a
> > > > UUID set of devices and instead have each device be independent.  This
> > > > seems to be an imposition on the nvidia implementation into the kernel
> > > > interface design.
> > > >  
> > >  
> > > Hi Alex,
> > >  
> > > I agree with you that we should not put UUID concept into a kernel API. At
> > > this point (without any prototyping), I am thinking of using a list of 
> > > virtual
> > > devices instead of UUID.
> > 
> > Hi Neo,
> > 
> > A UUID is a perfectly fine name, so long as we let it be just a UUID and
> > not the UUID matching some specific use case.
> > 
> > > > >  
> > > > > int vgpu_map_virtual_bar
> > > > > (
> > > > > uint64_t virt_bar_addr,
> > > > > uint64_t phys_bar_addr,
> > > > > uint32_t len,
> > > > > uint32_t flags
> > > > > )
> > > > >  
> > > > > EXPORT_SYMBOL(vgpu_map_virtual_bar);
> > > >  
> > > >  
> > > > Per the implementation provided, this needs to be implemented in the
> > > > vfio device driver, not in the iommu interface.  Finding the DMA mapping
> > > > of the device and replacing it is wrong.  It should be remapped at the
> > > > vfio device file interface using vm_ops.
> > > >  
> > >  
> > > So you are basically suggesting that we are going to take a mmap fault and
> > > within that fault handler, we will go into vendor driver to look up the
> > > "pre-registered" mapping and remap there.
> > >  
> > > Is my understanding correct?
> > 
> > Essentially, hopefully the vendor driver will have already registered
> > the backing for the mmap prior to the fault, but either way could work.
> > I think the key though is that you want to remap it onto the vma
> > accessing the vfio device file, not scanning it out of an IOVA mapping
> > that might be dynamic and doing a vma lookup based on the point in time
> > mapping of the BAR.  The latter doesn't give me much confidence that
> > mappings couldn't change while the former should be a one time fault.
> 
> Hi Alex,
> 
> The fact is that the vendor driver can only prevent such mmap fault by looking
> up the <iova, hva> mapping table that we have saved from IOMMU memory 
> listener

Why do we need to prevent the fault?  We need to handle the fault when
it occurs.

> when the guest region gets programmed. Also, like you have mentioned below, 
> such
> mapping between iova and hva shouldn't be changed as long as the SBIOS and
> guest OS are done with their job. 

But you don't know they're done with their job.

> Yes, you are right it is one time fault, but the gpu work is heavily 
> pipelined. 

Why does that matter?  We're talking about the first time the VM
accesses the range of the BAR that will be direct mapped to the physical
GPU.  This isn't going to happen in the middle of a benchmark, it's
going to happen during driver initialization in the guest.

> Probably we should just limit this interface to guest MMIO region and we can 
> have
> some crosscheck between the VFIO driver who has monitored the config space
> access to make sure nothing getting moved around?

No, the solution for the bar is very clear, map on fault to the vma
accessing the mmap and be done with it for the remainder of this
instance of the VM.

> > In case it's not clear to folks at Intel, the purpose of this is that a
> > vGPU may directly map a segment of the physical GPU MMIO space, but we
> > may not know what segment that is at setup time, when QEMU does an mmap
> > of the vfio device file descriptor.  The thought is that we can create
> > an invalid mapping when QEMU calls mmap(), knowing that it won't 

Re: [Qemu-devel] [PATCH] net/traffic-mirrorer:Add traffic-mirroer

2016-01-27 Thread Eric Blake
On 01/26/2016 05:44 PM, Hailiang Zhang wrote:
> On 2016/1/26 16:59, Zhang Chen wrote:
>> From: ZhangChen 
>>
>> Traffic-mirrorer is a plugin of netfilter.
>> It make qemu has ability to copy and mirror guest's
>> net packet. we output packet to chardev.
>>

>> +static void traffic_mirrorer_setup(NetFilterState *nf, Error **errp)
>> +{
>> +MirrorerState *s = FILTER_TRAFFIC_MIRRORER(nf);
>> +
>> +if (!s->outdev) {
>> +error_setg(errp, "filter traffic mirrorer needs 'outdev'
>> property set!"
>> +"property set!");
> 
> Duplicate 'property set!'.

For that matter, error_setg() messages should never end in '!'.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] VFIO based vGPU(was Re: [Announcement] 2015-Q3 release of XenGT - a Mediated ...)

2016-01-27 Thread Alex Williamson
On Wed, 2016-01-27 at 13:43 +0800, Jike Song wrote:
> On 01/27/2016 11:07 AM, Alex Williamson wrote:
> > On Wed, 2016-01-27 at 09:47 +0800, Jike Song wrote:
> > > On 01/27/2016 06:56 AM, Alex Williamson wrote:
> > > > On Tue, 2016-01-26 at 22:39 +, Tian, Kevin wrote:
> > > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > > > Sent: Wednesday, January 27, 2016 6:27 AM
> > > > > >  
> > > > > > On Tue, 2016-01-26 at 22:15 +, Tian, Kevin wrote:
> > > > > > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > > > > > Sent: Wednesday, January 27, 2016 6:08 AM
> > > > > > > >  
> > > > > > > > > > > >  
> > > > > > > > > > >  
> > > > > > > > > > > Today KVMGT (not using VFIO yet) registers I/O emulation 
> > > > > > > > > > > callbacks to
> > > > > > > > > > > KVM, so VM MMIO access will be forwarded to KVMGT 
> > > > > > > > > > > directly for
> > > > > > > > > > > emulation in kernel. If we reuse above R/W flags, the 
> > > > > > > > > > > whole emulation
> > > > > > > > > > > path would be unnecessarily long with obvious performance 
> > > > > > > > > > > impact. We
> > > > > > > > > > > either need a new flag here to indicate in-kernel 
> > > > > > > > > > > emulation (bias from
> > > > > > > > > > > passthrough support), or just hide the region 
> > > > > > > > > > > alternatively (let KVMGT
> > > > > > > > > > > to handle I/O emulation itself like today).
> > > > > > > > > >  
> > > > > > > > > > That sounds like a future optimization TBH.  There's very 
> > > > > > > > > > strict
> > > > > > > > > > layering between vfio and kvm.  Physical device assignment 
> > > > > > > > > > could make
> > > > > > > > > > use of it as well, avoiding a round trip through userspace 
> > > > > > > > > > when an
> > > > > > > > > > ioread/write would do.  Userspace also needs to orchestrate 
> > > > > > > > > > those kinds
> > > > > > > > > > of accelerators, there might be cases where userspace wants 
> > > > > > > > > > to see those
> > > > > > > > > > transactions for debugging or manipulating the device.  We 
> > > > > > > > > > can't simply
> > > > > > > > > > take shortcuts to provide such direct access.  Thanks,
> > > > > > > > > >  
> > > > > > > > >  
> > > > > > > > > But we have to balance such debugging flexibility and 
> > > > > > > > > acceptable performance.
> > > > > > > > > To me the latter one is more important otherwise there'd be 
> > > > > > > > > no real usage
> > > > > > > > > around this technique, while for debugging there are other 
> > > > > > > > > alternative (e.g.
> > > > > > > > > ftrace) Consider some extreme case with 100k traps/second and 
> > > > > > > > > then see
> > > > > > > > > how much impact a 2-3x longer emulation path can bring...
> > > > > > > >  
> > > > > > > > Are you jumping to the conclusion that it cannot be done with 
> > > > > > > > proper
> > > > > > > > layering in place?  Performance is important, but it's not an 
> > > > > > > > excuse to
> > > > > > > > abandon designing interfaces between independent components.  
> > > > > > > > Thanks,
> > > > > > > >  
> > > > > > >  
> > > > > > > Two are not controversial. My point is to remove unnecessary long 
> > > > > > > trip
> > > > > > > as possible. After another thought, yes we can reuse existing 
> > > > > > > read/write
> > > > > > > flags:
> > > > > > >   - KVMGT will expose a private control variable whether in-kernel
> > > > > > > delivery is required;
> > > > > >  
> > > > > > But in-kernel delivery is never *required*.  Wouldn't userspace 
> > > > > > want to
> > > > > > deliver in-kernel any time it possibly could?
> > > > > >  
> > > > > > >   - when the variable is true, KVMGT will register in-kernel MMIO
> > > > > > > emulation callbacks then VM MMIO request will be delivered to 
> > > > > > > KVMGT
> > > > > > > directly;
> > > > > > >   - when the variable is false, KVMGT will not register anything.
> > > > > > > VM MMIO request will then be delivered to Qemu and then 
> > > > > > > ioread/write
> > > > > > > will be used to finally reach KVMGT emulation logic;
> > > > > >  
> > > > > > No, that means the interface is entirely dependent on a backdoor 
> > > > > > through
> > > > > > KVM.  Why can't userspace (QEMU) do something like register an MMIO
> > > > > > region with KVM handled via a provided file descriptor and offset,
> > > > > > couldn't KVM then call the file ops without a kernel exit?  Thanks,
> > > > > >  
> > > > >  
> > > > > Could you elaborate this thought? If it can achieve the purpose w/o
> > > > > a kernel exit definitely we can adapt to it. :-)
> > > >  
> > > > I only thought of it when replying to the last email and have been doing
> > > > some research, but we already do quite a bit of synchronization through
> > > > file descriptors.  The kvm-vfio pseudo device uses a group file
> > > > descriptor to ensure a user has access to a group, allowing some degree
> > > > of interaction between modules.  Eventfds and irqfds already make 

Re: [Qemu-devel] [RFC PATCH v2 00/10] Add colo-proxy based on netfilter

2016-01-27 Thread Eric Blake
On 01/22/2016 12:46 AM, Wen Congyang wrote:
...

> On 01/20/2016 11:29 AM, Zhang Chen wrote:
> Sure.

Wow, that's a lot of wasted quoting.  Your mail weighed in at 24k, even
though...


>> Thanks. I think I get the point. So if there's a difference, primary
>> packet will only be sent after checkpoint and we could not assume the
>> checkpoint itself is reliable.
> 
> Yes.
> 
>>
>> Back to the filters design. We'd better still decouple packet comparing
>> out of netdev. Maybe a little bit more tweak on what you've suggested:
>>
>> -netdev tap,id=hn0
>> -object traffic-mirrorer,id=f0,netdev=hn0,queue=tx,outdev=mirrorer0
>> -object
>> traffic-redirector,id=f1,netdev=hn0,queue=rx,outdev=comparer0,indev=comparer2
>> -colo-comparer
>> primary_traffic=comparer0,secondary_traffic=comparer1,outdev=comparer2
>>
>> Just add one more socket for comparer for sending primary packet, and
>> let f1 redirector its output to netdev?
> 
> OK, I understand it now.
> Thanks for your suggestion.

...content-wise, you only added about 100 bytes.  It's okay to trim
replies down to relevant portions, to make it easier for readers to get
to the meat of your message.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH v14 3/8] Backup: clear all bitmap when doing block checkpoint

2016-01-27 Thread Stefan Hajnoczi
On Wed, Jan 13, 2016 at 05:18:27PM +0800, Changlong Xie wrote:
> diff --git a/blockjob.c b/blockjob.c
> index 80adb9d..0c8edfe 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -533,3 +533,14 @@ void block_job_txn_add_job(BlockJobTxn *txn, BlockJob 
> *job)
>  QLIST_INSERT_HEAD(&txn->jobs, job, txn_list);
>  block_job_txn_ref(txn);
>  }
> +
> +void block_job_do_checkpoint(BlockJob *job, Error **errp)
> +{
> +if (!job->driver->do_checkpoint) {
> +error_setg(errp, "The job %s doesn't support block checkpoint",
> +   BlockJobType_lookup[job->driver->job_type]);
> +return;
> +}
> +
> +job->driver->do_checkpoint(job, errp);
> +}
> diff --git a/include/block/blockjob.h b/include/block/blockjob.h
> index d84ccd8..abdba7c 100644
> --- a/include/block/blockjob.h
> +++ b/include/block/blockjob.h
> @@ -70,6 +70,9 @@ typedef struct BlockJobDriver {
>   * never both.
>   */
>  void (*abort)(BlockJob *job);
> +
> +/** Optional callback for job types that support checkpoint. */
> +void (*do_checkpoint)(BlockJob *job, Error **errp);

The COLO/replication-specific callbacks have been moved out of
BlockDriver into their own replication struct.  Similar reasoning
applies to BlockJobDriver:

The do_checkpoint() callback is only implemented by one type of job and
its purpose is related to COLO rather than jobs.  This is a strong
indication that this shouldn't be part of the generic BlockJobDriver
struct.

Please drop changes to the generic blockjob interface.  Instead, make
backup_do_checkpoint() public and add assert(job->driver->type ==
BLOCK_JOB_TYPE_BACKUP) into the function.

Then the replication filter can call backup_do_checkpoint() directly.

Stefan


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH v9 32/37] qapi: Rework deallocation of partial struct

2016-01-27 Thread Markus Armbruster
Eric Blake  writes:

> Commit cee2dedb noticed that if you have a partial flat union
> (such as if an input parse failed due to a missing
> discriminator), calling the dealloc visitor could result in
> trying to dereference the NULL pointer. But the fix it proposed
> requires the use of a 'data' member in the union, which may or
> may not be the same size as other branches of the union
> (consider a 32-bit platform where one of the branches is an
> int64), so it feels fairly dirty.  A better fix is to tweak all
> of the generated visit_type_implicit_FOO() functions to avoid
> dereferencing NULL in the first place, by not visiting the
> fields if the struct pointer itself is not present, at which
> point we no longer even need visit_start_union().  And no one
> was implementing visit_end_union() callbacks.
>
> While rewriting the code, use patterns that are closer to what
> is used elsewhere in the generated visitors, by using 'goto'
> to cleanup labels rather than putting followup code under 'if'
> conditions.  The change keeps the contract that any successful
> use of visit_start_implicit_struct() will be paired with a
> matching visit_end_implicit_struct(), even if intermediate
> processing is skipped.  We are safe in checking *obj alone, as
> the contract of visit_start_implicit_struct() requires a
> non-NULL obj.
>
> As an example of the changes to generated code:

This could be easier to understand if you show the change to the union
visit (hunks 2+3) before the change to one of its variant members (hunk
1).

> |@@ -1331,10 +1331,16 @@ static void visit_type_implicit_Blockdev
> | Error *err = NULL;
> |
> | visit_start_implicit_struct(v, (void **)obj, 
> sizeof(BlockdevOptionsArchipelago), );
> |-if (!err) {
> |-visit_type_BlockdevOptionsArchipelago_fields(v, obj, errp);
> |-visit_end_implicit_struct(v);
> |+if (err) {
> |+goto out;
> |+}
> |+if (!*obj) {
> |+goto out_obj;
> | }
> |+visit_type_BlockdevOptionsArchipelago_fields(v, obj, );
> |+out_obj:
> |+visit_end_implicit_struct(v);
> |+out:
> | error_propagate(errp, err);
> | }
> ...
> |@@ -1479,9 +1539,6 @@ void visit_type_BlockdevOptions(Visitor
> | if (err) {
> | goto out_obj;
> | }
> |-if (!visit_start_union(v, !!(*obj)->u.data, ) || err) {
> |-goto out_obj;
> |-}

If v is the dealloc visitor, the condition is !(*obj)->u.data.
Else, it's false.

(*obj)->u.data can be null only for a partially initialized obj.

Spelling out the obvious: partially initialized objects may be visited
with the dealloc visitor only.

So, this basically boils down to "if we're deallocating a partially
initialized object, and the variant part hasn't been initialized, bypass
the switch visiting the variant part."

Your patch changes it to visit the variant part unconditionally.  The
code doing that visit needs to be able to cope with an uninitialized
part.

> | switch ((*obj)->driver) {

Which variant do we visit?  If the tag hasn't been initialized, we
arbitrarily visit the first one.  Amazingly, this actually works, as we
shall see.

> | case BLOCKDEV_DRIVER_ARCHIPELAGO:
> | visit_type_implicit_BlockdevOptionsArchipelago(v, 
> &(*obj)->u.archipelago, );

Note that (*obj)->u.archipelago equals (*obj)->u.data by construction of
the union's C data type: the variant members are all stored boxed.

So, when (and only when, I think) we visit an uninitialized variant, the
visit function's obj parameter points to a null pointer.

break;
[More cases...]
> |@@ -1570,11 +1627,6 @@ void visit_type_BlockdevOptions(Visitor
default:
abort();
}
> | out_obj:
> | error_propagate(errp, err);
> | err = NULL;
> |-if (*obj) {
> |-visit_end_union(v, !!(*obj)->u.data, );
> |-}
> |-error_propagate(errp, err);
> |-err = NULL;
> | visit_end_struct(v, );

Now let's see how the function to visit the variant part changes.  Before:

static void visit_type_implicit_BlockdevOptionsArchipelago(Visitor *v, 
BlockdevOptionsArchipelago **obj, Error **errp)
{
Error *err = NULL;

visit_start_implicit_struct(v, (void **)obj, 
sizeof(BlockdevOptionsArchipelago), );
if (!err) {
visit_type_BlockdevOptionsArchipelago_fields(v, obj, errp);
visit_end_implicit_struct(v);
}
error_propagate(errp, err);
}

After:

static void visit_type_implicit_BlockdevOptionsArchipelago(Visitor *v, 
BlockdevOptionsArchipelago **obj, Error **errp)
{
Error *err = NULL;

visit_start_implicit_struct(v, (void **)obj, 
sizeof(BlockdevOptionsArchipelago), );
if (err) {
goto out;
}
if (!*obj) {
goto out_obj;
}
visit_type_BlockdevOptionsArchipelago_fields(v, obj, );
out_obj:
visit_end_implicit_struct(v);
out:
error_propagate(errp, err);

Re: [Qemu-devel] [RFC 6/7] hw: arm: virt: register reserved IOVA region

2016-01-27 Thread Pavel Fedin
 Hello!

> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 3839c68..7eaf8be 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -125,6 +125,7 @@ static const MemMapEntry a15memmap[] = {
>  [VIRT_GPIO] =   { 0x0903, 0x1000 },
>  [VIRT_SECURE_UART] ={ 0x0904, 0x1000 },
>  [VIRT_MMIO] =   { 0x0a00, 0x0200 },
> +[VIRT_RESERVED] =   { 0x0be0, 0x0010 },

 Looks like with this approach we would need to add this to all machine models 
which make use of PCI. But is it a good idea? As far
as i understand, the only requirement for this region is not to clash with 
guest RAM addresses. So, can we instead have some code,
which automatically finds some place, based on the size? For now we hardcode 
the size to 0x0010, but in future we could query
the host for the size, because it's still host's MSI controller.

Kind regards,
Pavel Fedin
Senior Engineer
Samsung Electronics Research center Russia





[Qemu-devel] [PATCH 2/3] pcdimm: add 'type' field to PCDIMMDeviceInfo

2016-01-27 Thread Vladimir Sementsov-Ogievskiy
The field is needed to distinguish pc-dimm and nvdimm.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Denis V. Lunev 
CC: Stefan Hajnoczi 
CC: Xiao Guangrong 
CC: "Michael S. Tsirkin" 
CC: Igor Mammedov 
CC: Eric Blake 
CC: Markus Armbruster 
---
 hw/mem/pc-dimm.c | 1 +
 qapi-schema.json | 5 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index 4f30950..7469bd4 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -178,6 +178,7 @@ int qmp_pc_dimm_device_list(Object *obj, void *opaque)
 di->size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP,
NULL);
 di->memdev = object_get_canonical_path(OBJECT(dimm->hostmem));
+di->type = g_strdup(object_get_typename(obj));
 
 info->u.dimm = di;
 elem->value = info;
diff --git a/qapi-schema.json b/qapi-schema.json
index 8d04897..3bcc957 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3924,6 +3924,8 @@
 #
 # @hotpluggable: true if device if could be added/removed while machine is 
running
 #
+# @type: device type: 'pc-dimm' or 'nvdimm' (since 2.6)
+#
 # Since: 2.1
 ##
 { 'struct': 'PCDIMMDeviceInfo',
@@ -3934,7 +3936,8 @@
 'node': 'int',
 'memdev': 'str',
 'hotplugged': 'bool',
-'hotpluggable': 'bool'
+'hotpluggable': 'bool',
+'type': 'str'
   }
 }
 
-- 
1.8.3.1




[Qemu-devel] [PATCH 3/3] balloon: don't use NVDIMM for ballooning

2016-01-27 Thread Vladimir Sementsov-Ogievskiy
NVDIMM is currently planned to be used as a backing store for a DAX
filesystem in the guest, and thus this memory is excluded from guest
memory management and LRUs.

In this case, libvirt running QEMU with a configured balloon almost
immediately inflates the balloon and effectively kills the guest, as
QEMU counts NVDIMM as part of the RAM.

Counting dimm devices as part of the ram for ballooning was started from
commit 463756d03:
 virtio-balloon: Fix balloon not working correctly when hotplug memory

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Denis V. Lunev 
CC: Stefan Hajnoczi 
CC: Xiao Guangrong 
CC: "Michael S. Tsirkin" 
CC: Igor Mammedov 
CC: Eric Blake 
CC: Markus Armbruster 
---
 hw/virtio/virtio-balloon.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 6a4c4d2..749be25 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -26,6 +26,7 @@
 #include "qapi/visitor.h"
 #include "qapi-event.h"
 #include "trace.h"
+#include "hw/mem/nvdimm.h"
 
 #if defined(__linux__)
 #include 
@@ -308,7 +309,9 @@ static ram_addr_t get_current_ram_size(void)
 if (value) {
 switch (value->type) {
 case MEMORY_DEVICE_INFO_KIND_DIMM:
-size += value->u.dimm->size;
+if (strcmp(value->u.dimm->type, TYPE_NVDIMM)) {
+size += value->u.dimm->size;
+}
 break;
 default:
 break;
-- 
1.8.3.1




[Qemu-devel] [PATCH v4 0/3] don't use NVDIMM for balooning

2016-01-27 Thread Vladimir Sementsov-Ogievskiy
v4:
 0001: Reviewed-by: Eric Blake 
 second patch is split into 0002 and 0003
 0002: Add 'type' field instead of 'balloonable' to PCDIMMDeviceInfo
 0003: check 'type' instead of 'balloonable'

v3:
- do not use additional class variable

NVDIMM is currently planned to be used as a backing store for a DAX
filesystem in the guest, and thus this memory is excluded from guest
memory management and LRUs.

In this case, libvirt running QEMU with a configured balloon almost
immediately inflates the balloon and effectively kills the guest, as
QEMU counts NVDIMM as part of the RAM.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Denis V. Lunev 
CC: Stefan Hajnoczi 
CC: Xiao Guangrong 
CC: "Michael S. Tsirkin" 
CC: Igor Mammedov 
CC: Eric Blake 
CC: Markus Armbruster 

Vladimir Sementsov-Ogievskiy (3):
  move get_current_ram_size to virtio-balloon.c
  pcdimm: add 'type' field to PCDIMMDeviceInfo
  balloon: don't use NVDIMM for ballooning

 hw/mem/pc-dimm.c| 27 +--
 hw/virtio/virtio-balloon.c  | 29 +
 include/exec/cpu-common.h   |  1 -
 qapi-schema.json|  5 -
 stubs/qmp_pc_dimm_device_list.c |  5 -
 5 files changed, 34 insertions(+), 33 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH 1/3] move get_current_ram_size to virtio-balloon.c

2016-01-27 Thread Vladimir Sementsov-Ogievskiy
get_current_ram_size() is used only in virtio-balloon.c
This patch moves it into virtio-balloon and makes it static, to allow
some balloon-specific tuning.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Denis V. Lunev 
Reviewed-by: Eric Blake 

CC: Stefan Hajnoczi 
CC: Xiao Guangrong 
CC: "Michael S. Tsirkin" 
CC: Igor Mammedov 
CC: Eric Blake 
CC: Markus Armbruster 
---
 hw/mem/pc-dimm.c| 26 --
 hw/virtio/virtio-balloon.c  | 26 ++
 include/exec/cpu-common.h   |  1 -
 stubs/qmp_pc_dimm_device_list.c |  5 -
 4 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c
index d5cdab2..4f30950 100644
--- a/hw/mem/pc-dimm.c
+++ b/hw/mem/pc-dimm.c
@@ -191,32 +191,6 @@ int qmp_pc_dimm_device_list(Object *obj, void *opaque)
 return 0;
 }
 
-ram_addr_t get_current_ram_size(void)
-{
-MemoryDeviceInfoList *info_list = NULL;
-MemoryDeviceInfoList **prev = _list;
-MemoryDeviceInfoList *info;
-ram_addr_t size = ram_size;
-
-qmp_pc_dimm_device_list(qdev_get_machine(), );
-for (info = info_list; info; info = info->next) {
-MemoryDeviceInfo *value = info->value;
-
-if (value) {
-switch (value->type) {
-case MEMORY_DEVICE_INFO_KIND_DIMM:
-size += value->u.dimm->size;
-break;
-default:
-break;
-}
-}
-}
-qapi_free_MemoryDeviceInfoList(info_list);
-
-return size;
-}
-
 static int pc_dimm_slot2bitmap(Object *obj, void *opaque)
 {
 unsigned long *bitmap = opaque;
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 9671635..6a4c4d2 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -294,6 +294,32 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, 
uint8_t *config_data)
 memcpy(config_data, , sizeof(struct virtio_balloon_config));
 }
 
+static ram_addr_t get_current_ram_size(void)
+{
+MemoryDeviceInfoList *info_list = NULL;
+MemoryDeviceInfoList **prev = _list;
+MemoryDeviceInfoList *info;
+ram_addr_t size = ram_size;
+
+qmp_pc_dimm_device_list(qdev_get_machine(), );
+for (info = info_list; info; info = info->next) {
+MemoryDeviceInfo *value = info->value;
+
+if (value) {
+switch (value->type) {
+case MEMORY_DEVICE_INFO_KIND_DIMM:
+size += value->u.dimm->size;
+break;
+default:
+break;
+}
+}
+}
+qapi_free_MemoryDeviceInfoList(info_list);
+
+return size;
+}
+
 static void virtio_balloon_set_config(VirtIODevice *vdev,
   const uint8_t *config_data)
 {
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 85aa403..a0ad2ac 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -54,7 +54,6 @@ typedef uintptr_t ram_addr_t;
 #endif
 
 extern ram_addr_t ram_size;
-ram_addr_t get_current_ram_size(void);
 
 /* memory API */
 
diff --git a/stubs/qmp_pc_dimm_device_list.c b/stubs/qmp_pc_dimm_device_list.c
index b584bd8..5cb220c 100644
--- a/stubs/qmp_pc_dimm_device_list.c
+++ b/stubs/qmp_pc_dimm_device_list.c
@@ -5,8 +5,3 @@ int qmp_pc_dimm_device_list(Object *obj, void *opaque)
 {
return 0;
 }
-
-ram_addr_t get_current_ram_size(void)
-{
-return ram_size;
-}
-- 
1.8.3.1




Re: [Qemu-devel] virtio-scsi/blk dataplane and guest memory allocation

2016-01-27 Thread Roy Shterman
Tried it again,

It is important to understand that after modifying and saving the configuration
XML with virsh edit $name_of_guest,

when I reopen the XML I can't see the iothread configuration in it.
I don't understand why.

Also, after adding what you suggested, I checked with the "info qtree" command on
the guest, looking for a sign that data-plane is working.

This is the output of info qtree:

  dev: virtio-blk-pci, id "virtio-disk1"
class = 0 (0x0)
ioeventfd = true
vectors = 2 (0x2)
virtio-pci-bus-master-bug-migration = false
disable-legacy = false
disable-modern = true
migrate-extra = false
modern-pio-notify = false
x-disable-pcie = false
addr = 07.0
romfile = ""
rombar = 1 (0x1)
multifunction = false
command_serr_enable = true
class SCSI controller, addr 00:07.0, pci id 1af4:1001 (sub
1af4:0002)
bar 0: i/o at 0xc0c0 [0xc0ff]
bar 1: mem at 0xfebf3000 [0xfebf3fff]
bus: virtio-bus
  type virtio-pci-bus
  dev: virtio-blk-device, id ""
drive = "drive-virtio-disk1"
logical_block_size = 512 (0x200)
physical_block_size = 512 (0x200)
min_io_size = 0 (0x0)
opt_io_size = 0 (0x0)
discard_granularity = 4294967295 (0xffffffff)
cyls = 16383 (0x3fff)
heads = 16 (0x10)
secs = 63 (0x3f)
serial = ""
config-wce = true
scsi = true
request-merging = true
indirect_desc = true
event_idx = true
notify_on_empty = true
any_layout = false
  dev: virtio-blk-pci, id "virtio-disk0"
class = 0 (0x0)
ioeventfd = true
vectors = 2 (0x2)
virtio-pci-bus-master-bug-migration = false
disable-legacy = false
disable-modern = true
migrate-extra = false
modern-pio-notify = false
x-disable-pcie = false
addr = 04.0
romfile = ""
rombar = 1 (0x1)
multifunction = false
command_serr_enable = true
class SCSI controller, addr 00:04.0, pci id 1af4:1001 (sub
1af4:0002)
bar 0: i/o at 0xc040 [0xc07f]
bar 1: mem at 0xfebf1000 [0xfebf1fff]
bus: virtio-bus
  type virtio-pci-bus
  dev: virtio-blk-device, id ""
drive = "drive-virtio-disk0"
logical_block_size = 512 (0x200)
physical_block_size = 512 (0x200)
min_io_size = 0 (0x0)
opt_io_size = 0 (0x0)
discard_granularity = 4294967295 (0xffffffff)
cyls = 16383 (0x3fff)
heads = 16 (0x10)
secs = 63 (0x3f)
serial = ""
config-wce = true
scsi = false
request-merging = true
indirect_desc = true
event_idx = true
notify_on_empty = true
any_layout = false
  dev: virtio-scsi-pci, id "scsi1"
ioeventfd = true
vectors = 4 (0x4)
virtio-pci-bus-master-bug-migration = false
disable-legacy = false
disable-modern = true
migrate-extra = false
modern-pio-notify = false
x-disable-pcie = false
addr = 08.0
romfile = ""
rombar = 1 (0x1)
multifunction = false
command_serr_enable = true
class SCSI controller, addr 00:08.0, pci id 1af4:1004 (sub
1af4:0008)
bar 0: i/o at 0xc100 [0xc13f]
bar 1: mem at 0xfebf4000 [0xfebf4fff]
bus: virtio-bus
  type virtio-pci-bus
  dev: virtio-scsi-device, id ""
num_queues = 1 (0x1)
max_sectors = 65535 (0xffff)
cmd_per_lun = 128 (0x80)
hotplug = true
param_change = true
indirect_desc = true
event_idx = true
notify_on_empty = true
any_layout = true
bus: scsi1.0
  type SCSI
  dev: scsi-block, id "scsi1-0-0-0"
drive = "drive-scsi1-0-0-0"
channel = 0 (0x0)
scsi-id = 0 (0x0)
lun = 0 (0x0)

Thanks,
Roy



On Wed, Jan 27, 2016 at 9:03 PM, Roy Shterman 
wrote:

> Hi,
>
> First of all thank very much for your help,
>
> Second, unfortunately data-plane didn't worked well, I tried to add
> threads from the instructions you gave me.
>
> Here is my full xml file, maybe you can help me to understand why it
> didn't worked :
>
> 
>   gen-r-vrt-105-007-RH7.0x64
>   8f79e97e-d452-4577-82bd-2ed903773026
>   2097152
>   2097152
>   
> 8388608
>   
>   
> 
>   
>   2
>   
> hvm
> 
>   
>   
> 
> 
> 
>   
>   
>   destroy
>   restart
>   restart
>   
>
> /.autodirect/mtrswgwork/roysh/git/qemu/x86_64-softmmu/qemu-system-x86_64
> 
>   
>   
>   
>function='0x0'/>
> 
> 
>   
>   

Re: [Qemu-devel] virtio-scsi/blk dataplane and guest memory allocation

2016-01-27 Thread Fam Zheng
On Thu, 01/28 09:28, Roy Shterman wrote:
> Tried it again,
> 
> Important to understand that after modifying and saving configuration of
> xml with virsh edit $name_of_guest
> 
> when i reenter the xml i can't see the iothread configuration in there.
> don't understand why.
> 
> Also, after add what you suggested I checked with "info qtree" command on
> the guest looking for sign of data-plane is working,

Currently "info qtree" won't reflect that information. You'll need to use ps to
see if the command line includes "iothread" parts as I suggested in the first
reply.

Fam



[Qemu-devel] [PATCH 0/2] blockjob: Fix dead loop with block_job_finish_sync on dataplane disks

2016-01-27 Thread Fam Zheng
I noticed this bug when reviewing Max's bdrv_close_all() series, so here goes
the fix.


Fam Zheng (2):
  blockjob: Rename block_job_defer_to_main_loop
  blockjob: Fix hang in block_job_finish_sync

 block/backup.c|  2 +-
 block/commit.c|  2 +-
 block/mirror.c|  2 +-
 block/stream.c|  2 +-
 blockjob.c|  8 +---
 include/block/blockjob.h  | 14 +-
 tests/test-blockjob-txn.c |  2 +-
 7 files changed, 19 insertions(+), 13 deletions(-)

-- 
2.4.3




[Qemu-devel] [PATCH 1/2] blockjob: Rename block_job_defer_to_main_loop

2016-01-27 Thread Fam Zheng
The next patch will make this function more restrictive than it is now;
rename it and update the comment to reflect the change.

Signed-off-by: Fam Zheng 
---
 block/backup.c|  2 +-
 block/commit.c|  2 +-
 block/mirror.c|  2 +-
 block/stream.c|  2 +-
 blockjob.c|  4 ++--
 include/block/blockjob.h  | 12 +++-
 tests/test-blockjob-txn.c |  2 +-
 7 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 00cafdb..b429666 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -482,7 +482,7 @@ static void coroutine_fn backup_run(void *opaque)
 
 data = g_malloc(sizeof(*data));
 data->ret = ret;
-block_job_defer_to_main_loop(>common, backup_complete, data);
+block_job_coroutine_complete(>common, backup_complete, data);
 }
 
 void backup_start(BlockDriverState *bs, BlockDriverState *target,
diff --git a/block/commit.c b/block/commit.c
index 446a3ae..f6b93bd 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -181,7 +181,7 @@ out:
 
 data = g_malloc(sizeof(*data));
 data->ret = ret;
-block_job_defer_to_main_loop(>common, commit_complete, data);
+block_job_coroutine_complete(>common, commit_complete, data);
 }
 
 static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
diff --git a/block/mirror.c b/block/mirror.c
index e9e151c..d665f2b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -626,7 +626,7 @@ immediate_exit:
 /* Before we switch to target in mirror_exit, make sure data doesn't
  * change. */
 bdrv_drained_begin(s->common.bs);
-block_job_defer_to_main_loop(>common, mirror_exit, data);
+block_job_coroutine_complete(>common, mirror_exit, data);
 }
 
 static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
diff --git a/block/stream.c b/block/stream.c
index cafaa07..9572ce3 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -194,7 +194,7 @@ wait:
 data = g_malloc(sizeof(*data));
 data->ret = ret;
 data->reached_end = sector_num == end;
-block_job_defer_to_main_loop(>common, stream_complete, data);
+block_job_coroutine_complete(>common, stream_complete, data);
 }
 
 static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp)
diff --git a/blockjob.c b/blockjob.c
index 80adb9d..4b16720 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -471,7 +471,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
 
 qemu_bh_delete(data->bh);
 
-/* Prevent race with block_job_defer_to_main_loop() */
+/* Prevent race with block_job_coroutine_complete() */
 aio_context_acquire(data->aio_context);
 
 /* Fetch BDS AioContext again, in case it has changed */
@@ -487,7 +487,7 @@ static void block_job_defer_to_main_loop_bh(void *opaque)
 g_free(data);
 }
 
-void block_job_defer_to_main_loop(BlockJob *job,
+void block_job_coroutine_complete(BlockJob *job,
   BlockJobDeferToMainLoopFn *fn,
   void *opaque)
 {
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index d84ccd8..de59fc2 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -393,18 +393,20 @@ BlockErrorAction block_job_error_action(BlockJob *job, 
BlockDriverState *bs,
 typedef void BlockJobDeferToMainLoopFn(BlockJob *job, void *opaque);
 
 /**
- * block_job_defer_to_main_loop:
+ * block_job_coroutine_complete:
  * @job: The job
  * @fn: The function to run in the main loop
  * @opaque: The opaque value that is passed to @fn
  *
- * Execute a given function in the main loop with the BlockDriverState
- * AioContext acquired.  Block jobs must call bdrv_unref(), bdrv_close(), and
- * anything that uses bdrv_drain_all() in the main loop.
+ * Complete the block job coroutine and execute a given function in the main
+ * loop with the BlockDriverState AioContext acquired.  Block jobs must call
+ * bdrv_unref(), bdrv_close(), and anything that uses bdrv_drain_all() in the
+ * main loop. After calling this, the block job coroutine should complete right
+ * away, without doing any heavy operations such as I/O or block_job_yield().
  *
  * The @job AioContext is held while @fn executes.
  */
-void block_job_defer_to_main_loop(BlockJob *job,
+void block_job_coroutine_complete(BlockJob *job,
   BlockJobDeferToMainLoopFn *fn,
   void *opaque);
 
diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c
index 34747e9..56442f2 100644
--- a/tests/test-blockjob-txn.c
+++ b/tests/test-blockjob-txn.c
@@ -57,7 +57,7 @@ static void coroutine_fn test_block_job_run(void *opaque)
 }
 }
 
-block_job_defer_to_main_loop(job, test_block_job_complete,
+block_job_coroutine_complete(job, test_block_job_complete,
  (void *)(intptr_t)s->rc);
 }
 
-- 
2.4.3




[Qemu-devel] [PATCH 2/2] blockjob: Fix hang in block_job_finish_sync

2016-01-27 Thread Fam Zheng
With a mirror job running on a virtio-blk dataplane disk, sending "q" to
HMP will cause a dead loop in block_job_finish_sync.

This is because the aio_poll() only processes the AIO context of bs
which has no more work to do, while the main loop BH that is scheduled
for setting the job->completed flag is never processed.

Fix this by adding a "ctx" pointer in BlockJob structure, to track which
context to poll for the block job to make progress. Its value is set to
the BDS context at block job creation, until
block_job_coroutine_complete() is called by the block job coroutine.
After that point, the block job's work is deferred to main loop BH.

Signed-off-by: Fam Zheng 
---
 blockjob.c   | 4 +++-
 include/block/blockjob.h | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/blockjob.c b/blockjob.c
index 4b16720..4ea1ce0 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -74,6 +74,7 @@ void *block_job_create(const BlockJobDriver *driver, 
BlockDriverState *bs,
 job->opaque= opaque;
 job->busy  = true;
 job->refcnt= 1;
+job->ctx   = bdrv_get_aio_context(bs);
 bs->job = job;
 
 /* Only set speed when necessary to avoid NotSupported error */
@@ -304,7 +305,7 @@ static int block_job_finish_sync(BlockJob *job,
 return -EBUSY;
 }
 while (!job->completed) {
-aio_poll(bdrv_get_aio_context(bs), true);
+aio_poll(job->ctx, true);
 }
 ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
 block_job_unref(job);
@@ -497,6 +498,7 @@ void block_job_coroutine_complete(BlockJob *job,
 data->aio_context = bdrv_get_aio_context(job->bs);
 data->fn = fn;
 data->opaque = opaque;
+job->ctx = qemu_get_aio_context();
 
 qemu_bh_schedule(data->bh);
 }
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index de59fc2..5c6a884 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -92,6 +92,8 @@ struct BlockJob {
  */
 char *id;
 
+AioContext *ctx;
+
 /**
  * The coroutine that executes the job.  If not NULL, it is
  * reentered when busy is false and the job is cancelled.
-- 
2.4.3




Re: [Qemu-devel] [PATCH V2] net/traffic-mirror:Add traffic-mirror

2016-01-27 Thread Zhang Chen



On 01/28/2016 01:44 PM, Jason Wang wrote:


On 01/27/2016 10:40 AM, Zhang Chen wrote:

From: ZhangChen 

Traffic-mirror is a netfilter plugin.
It gives qemu the ability to copy and mirror the guest's
network packets. We output the packets to a chardev.

usage:

-netdev tap,id=hn0
-chardev socket,id=mirror0,host=ip_primary,port=X,server,nowait
-traffic-mirror,id=m0,netdev=hn0,queue=tx/rx/all,outdev=mirror0

Signed-off-by: ZhangChen 
Signed-off-by: Wen Congyang 
Reviewed-by: Yang Hongyang 

Thanks for the patch. Several questions:

- I'm curious about how the patch was tested? Simple setup e.g:

-netdev tap,id=hn0 -device virtio-net-pci,netdev=hn0 -chardev
socket,id=c0,host=localhost,port=,server,nowait -object
traffic-mirror,netdev=hn0,outdev=c0,id=f0 -netdev
socket,id=s0,connect=127.0.0.1: -device e1000,netdev=s0

does not work for me.


I tested it in this way.
primary:
-netdev tap,id=hn0 -device e1000,netdev=hn0 -chardev 
socket,id=mirror0,host=3.3.3.3,port=9003,server,nowait

 -object traffic-mirror,id=f0,netdev=hn0,queue=tx,outdev=mirror0

secondary:
-netdev tap,id=hn0 -device e1000,netdev=hn0 -chardev 
socket,id=mirror0,host=3.3.3.3,port=9003 -object 
traffic-reader,id=f1,netdev=hn0,queue=rx,indev=mirror0


I wrote a traffic-reader demo to read the chardev socket and print it in the
monitor.





- Is reliable mirroring (e.g. no packet drops during mirroring)
needed for COLO? If yes, this patch does not seem to guarantee it.


I will fix it in V3


- Please consider to write a unit test for this patch.


write a unit test like tests/test-netfilter.c ?


And see comments below.

Thanks



---
  net/Makefile.objs|   1 +
  net/traffic-mirror.c | 173 +++
  qemu-options.hx  |   5 ++
  vl.c |   3 +-
  4 files changed, 181 insertions(+), 1 deletion(-)
  create mode 100644 net/traffic-mirror.c

diff --git a/net/Makefile.objs b/net/Makefile.objs
index 5fa2f97..de06ebe 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -15,3 +15,4 @@ common-obj-$(CONFIG_VDE) += vde.o
  common-obj-$(CONFIG_NETMAP) += netmap.o
  common-obj-y += filter.o
  common-obj-y += filter-buffer.o
+common-obj-y += traffic-mirror.o

Let's s/traffic-mirror/filter-mirror/g to be consistent with other filters.



OK~ I will fix it in V3


diff --git a/net/traffic-mirror.c b/net/traffic-mirror.c
new file mode 100644
index 000..bed915c
--- /dev/null
+++ b/net/traffic-mirror.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "net/filter.h"
+#include "net/net.h"
+#include "qemu-common.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi-visit.h"
+#include "qom/object.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "sysemu/char.h"
+#include "qemu/iov.h"
+
+#define FILTER_TRAFFIC_MIRROR(obj) \
+OBJECT_CHECK(MirrorState, (obj), TYPE_FILTER_TRAFFIC_MIRROR)
+
+#define TYPE_FILTER_TRAFFIC_MIRROR "traffic-mirror"
+
+typedef struct MirrorState {
+NetFilterState parent_obj;
+char *outdev;
+CharDriverState *chr_out;
+
+} MirrorState;
+
+static ssize_t traffic_mirror_send(NetFilterState *nf,
+   const struct iovec *iov,
+   int iovcnt)
+{
+MirrorState *s = FILTER_TRAFFIC_MIRROR(nf);
+ssize_t ret = 0;
+ssize_t size = 0;
+char *buf;
+
+size = iov_size(iov, iovcnt);
+if (!size) {
+return 0;
+}
+
+buf = g_malloc0(size);
+iov_to_buf(iov, iovcnt, 0, buf, size);
+ret = qemu_chr_fe_write(s->chr_out, (uint8_t *), sizeof(size));

htonl(size)?


We do not need this.


+if (ret < 0) {

This check is not sufficient: for some reason, only part of the packets
may be sent by the socket. This needs to be handled properly, otherwise it may
confuse the receiver.


I will fix it in next version.


+g_free(buf);
+return ret;
+}
+
+ret = qemu_chr_fe_write(s->chr_out, (uint8_t *)buf, size);
+g_free(buf);
+return ret;

Ditto.


I will fix it in next version.


+}
+
+static ssize_t traffic_mirror_receive_iov(NetFilterState *nf,
+ NetClientState *sender,
+ unsigned flags,
+ const struct iovec *iov,
+ int iovcnt,
+ NetPacketSent *sent_cb)
+{
+/*
+ * We copy and mirror packet to outdev,
+ * then put back the packet.
+ */

The code could explain itself, so the comment is unnecessary.



  1   2   3   >