Re: [Qemu-devel] [PATCH v4 13/13] vfio: Add trace events in migration code path

2019-06-20 Thread Kirti Wankhede



On 6/21/2019 12:20 AM, Dr. David Alan Gilbert wrote:
> * Kirti Wankhede (kwankh...@nvidia.com) wrote:
>> Signed-off-by: Kirti Wankhede 
>> Reviewed-by: Neo Jia 
> 
> Thanks, adding traces really helps; however, it might be easier
> if you just add them in your previous patches where you're
> adding the functions.
> 

Ok. I'll change it.

Thanks,
Kirti

> Dave
> 
>> ---
>>  hw/vfio/migration.c  | 26 ++
>>  hw/vfio/trace-events | 18 ++
>>  2 files changed, 44 insertions(+)
>>
>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
>> index 68775b5dec11..70c03f1a969f 100644
>> --- a/hw/vfio/migration.c
>> +++ b/hw/vfio/migration.c
>> @@ -21,6 +21,7 @@
>>  #include "exec/ramlist.h"
>>  #include "exec/ram_addr.h"
>>  #include "pci.h"
>> +#include "trace.h"
>>  
>>  /*
>>   * Flags used as delimiter:
>> @@ -104,6 +105,7 @@ static int vfio_migration_set_state(VFIODevice 
>> *vbasedev, uint32_t state)
>>  }
>>  
>>  vbasedev->device_state = state;
>> +trace_vfio_migration_set_state(vbasedev->name, state);
>>  return 0;
>>  }
>>  
>> @@ -173,6 +175,8 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice 
>> *vbasedev)
>>  qemu_put_be64(f, data_size);
>>  }
>>  
>> +trace_vfio_save_buffer(vbasedev->name, data_offset, data_size,
>> +   migration->pending_bytes);
>>  ret = qemu_file_get_error(f);
>>  
>>  return data_size;
>> @@ -195,6 +199,7 @@ static int vfio_update_pending(VFIODevice *vbasedev)
>>  }
>>  
>>  migration->pending_bytes = pending_bytes;
>> +trace_vfio_update_pending(vbasedev->name, pending_bytes);
>>  return 0;
>>  }
>>  
>> @@ -209,6 +214,8 @@ static int vfio_save_device_config_state(QEMUFile *f, 
>> void *opaque)
>>  }
>>  qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>>  
>> +trace_vfio_save_device_config_state(vbasedev->name);
>> +
>>  return qemu_file_get_error(f);
>>  }
>>  
>> @@ -225,6 +232,7 @@ static int vfio_load_device_config_state(QEMUFile *f, 
>> void *opaque)
>>  return -EINVAL;
>>  }
>>  
>> +trace_vfio_load_device_config_state(vbasedev->name);
>>  return qemu_file_get_error(f);
>>  }
>>  
>> @@ -343,6 +351,9 @@ void vfio_get_dirty_page_list(VFIODevice *vbasedev,
>>  }
>>  } while (count < pfn_count);
>>  
>> +trace_vfio_get_dirty_page_list(vbasedev->name, start_pfn, pfn_count,
>> +   page_size);
>> +
>>  dpl_unlock:
>> qemu_mutex_unlock(&migration->lock);
>>  }
>> @@ -390,6 +401,7 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
>>  return ret;
>>  }
>>  
>> +trace_vfio_save_setup(vbasedev->name);
>>  return 0;
>>  }
>>  
>> @@ -401,6 +413,7 @@ static void vfio_save_cleanup(void *opaque)
>>  if (migration->region.buffer.mmaps) {
>>  vfio_region_unmap(>region.buffer);
>>  }
>> +trace_vfio_cleanup(vbasedev->name);
>>  }
>>  
>>  static void vfio_save_pending(QEMUFile *f, void *opaque,
>> @@ -424,6 +437,7 @@ static void vfio_save_pending(QEMUFile *f, void *opaque,
>>  *res_postcopy_only += migration->pending_bytes;
>>  }
>>  *res_compatible += 0;
>> +trace_vfio_save_pending(vbasedev->name);
>>  }
>>  
>>  static int vfio_save_iterate(QEMUFile *f, void *opaque)
>> @@ -451,6 +465,7 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>>  return ret;
>>  }
>>  
>> +trace_vfio_save_iterate(vbasedev->name);
>>  return ret;
>>  }
>>  
>> @@ -504,6 +519,8 @@ static int vfio_save_complete_precopy(QEMUFile *f, void 
>> *opaque)
>>  error_report("Failed to set state STOPPED");
>>  return ret;
>>  }
>> +
>> +trace_vfio_save_complete_precopy(vbasedev->name);
>>  return ret;
>>  }
>>  
>> @@ -544,6 +561,9 @@ static int vfio_load_state(QEMUFile *f, void *opaque, 
>> int version_id)
>>  
>>  data = qemu_get_be64(f);
>>  while (data != VFIO_MIG_FLAG_END_OF_STATE) {
>> +
>> +trace_vfio_load_state(vbasedev->name, data);
>> +
>>  switch (data) {
>>  case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
>>  {
>> @@ -627,6 +647,8 @@ static int vfio_load_state(QEMUFile *f, void *opaque, 
>> int version_id)
>>  return -EINVAL;
>>  }
>>  }
>> +trace_vfio_load_state_device_data(vbasedev->name, data_offset,
>> +  data_size);
>>  break;
>>  }
>>  }
>> @@ -668,6 +690,7 @@ static void vfio_vmstate_change(void *opaque, int 
>> running, RunState state)
>>  }
>>  
>>  vbasedev->vm_running = running;
>> +trace_vfio_vmstate_change(vbasedev->name, running);
>>  }
>>  
>>  static void vfio_migration_state_notifier(Notifier *notifier, void *data)
>> @@ -676,6 +699,8 @@ static void vfio_migration_state_notifier(Notifier 
>> *notifier, void *data)
>>  VFIODevice *vbasedev = container_of(notifier, VFIODevice, 
>> migration_state);
>>  

Re: [Qemu-devel] [PATCH v4 01/13] vfio: KABI for migration interface

2019-06-20 Thread Kirti Wankhede



On 6/20/2019 10:48 PM, Alex Williamson wrote:
> On Thu, 20 Jun 2019 20:07:29 +0530
> Kirti Wankhede  wrote:
> 
>> - Defined MIGRATION region type and sub-type.
>> - Used 3 bits to define VFIO device states.
>> Bit 0 => _RUNNING
>> Bit 1 => _SAVING
>> Bit 2 => _RESUMING
>> Combination of these bits defines VFIO device's state during migration
>> _STOPPED => All bits 0 indicates VFIO device stopped.
>> _RUNNING => Normal VFIO device running state.
>> _SAVING | _RUNNING => vCPUs are running, VFIO device is running but starts
>>   saving device state, i.e. pre-copy state
>> _SAVING  => vCPUs are stopped, VFIO device should be stopped and
>>   save device state, i.e. stop-and-copy state
>> _RESUMING => VFIO device resuming state.
>> _SAVING | _RESUMING => Invalid state if _SAVING and _RESUMING bits are 
>> set
>> - Defined vfio_device_migration_info structure which will be placed at 0th
>>   offset of migration region to get/set VFIO device related information.
>>   Defined members of structure and usage on read/write access:
>> * device_state: (read/write)
>> To convey VFIO device state to be transitioned to. Only 3 bits are 
>> used
>> as of now.
>> * pending bytes: (read only)
>> To get pending bytes yet to be migrated for VFIO device.
>> * data_offset: (read only)
>> To get the data offset in the migration region from which data is
>> available during _SAVING and to which data should be written by the
>> user space application during the _RESUMING state
>> * data_size: (read/write)
>> To get and set size of data copied in migration region during _SAVING
>> and _RESUMING state.
>> * start_pfn, page_size, total_pfns: (write only)
>> To get bitmap of dirty pages from vendor driver from given
>> start address for total_pfns.
>> * copied_pfns: (read only)
>> To get number of pfns bitmap copied in migration region.
>> Vendor driver should copy the bitmap with bits set only for
>> pages to be marked dirty in migration region. Vendor driver
>> should return 0 if there are 0 pages dirty in requested
>> range. Vendor driver should return -1 to mark all pages in the 
>> section
>> as dirty
>>
>> Migration region looks like:
>>  ------------------------------------------------------------------
>> | vfio_device_migration_info |            data section              |
>> |                            |    //////////////////////////////    |
>>  ------------------------------------------------------------------
>>  ^                               ^                                ^
>>  offset 0 (trapped part)         data_offset              data_size
>>
>> The data section always follows the vfio_device_migration_info
>> structure in the region, so data_offset will always be non-0.
>> The offset from which data is copied is decided by the kernel driver;
>> the data section can be trapped or mapped depending on how the kernel
>> driver defines it. If mmapped, data_offset should be page aligned,
>> whereas the initial section containing the vfio_device_migration_info
>> structure might not end at a page-aligned offset.
>>
>> Signed-off-by: Kirti Wankhede 
>> Reviewed-by: Neo Jia 
>> ---
>>  linux-headers/linux/vfio.h | 71 
>> ++
>>  1 file changed, 71 insertions(+)
>>
>> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
>> index 24f505199f83..274ec477eb82 100644
>> --- a/linux-headers/linux/vfio.h
>> +++ b/linux-headers/linux/vfio.h
>> @@ -372,6 +372,77 @@ struct vfio_region_gfx_edid {
>>   */
>>  #define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD(1)
>>  
>> +/* Migration region type and sub-type */
>> +#define VFIO_REGION_TYPE_MIGRATION  (2)
>> +#define VFIO_REGION_SUBTYPE_MIGRATION   (1)
>> +
>> +/**
>> + * Structure vfio_device_migration_info is placed at 0th offset of
>> + * VFIO_REGION_SUBTYPE_MIGRATION region to get/set VFIO device related 
>> migration
>> + * information. Field accesses from this structure are only supported at
>> + * their native width and alignment; other accesses should return an error.
>> + *
>> + * device_state: (read/write)
>> + *  To indicate to the vendor driver the state the VFIO device should be
>> + *  transitioned to. If the device state transition fails, the write to
>> + *  this field returns an error.
>> + *  It consists of 3 bits:
>> + *  - If bit 0 is set, it indicates the _RUNNING state. When it is reset,
>> + *that indicates the _STOPPED state. When the device is changed to
>> + *_STOPPED, the driver should stop the device before the write returns.
>> + *  - If bit 1 set, indicates _SAVING state.
>> + *  - If bit 2 set, indicates _RESUMING state.
>> + *
>> + * pending bytes: (read only)
>> + *  Read pending bytes yet to be migrated from vendor driver
>> + *
>> + * data_offset: (read only)
>> 
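As a rough illustration of the three device_state bits described above, the
following sketch shows how they combine; the macro names here are assumptions
for illustration only, not necessarily the ones defined in the proposed header:

    /* Illustrative only -- bit positions follow the commit message above. */
    #define VFIO_DEVICE_STATE_STOPPED   0         /* all bits clear */
    #define VFIO_DEVICE_STATE_RUNNING   (1 << 0)
    #define VFIO_DEVICE_STATE_SAVING    (1 << 1)
    #define VFIO_DEVICE_STATE_RESUMING  (1 << 2)

    /* Pre-copy: the device keeps running while its state is being saved. */
    #define VFIO_DEVICE_STATE_PRECOPY \
        (VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING)

    /* _SAVING | _RESUMING set together is an invalid combination. */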

Re: [Qemu-devel] [Qemu-riscv] [RFC v1 4/5] roms: Add OpenSBI version 0.3

2019-06-20 Thread Bin Meng
On Thu, Jun 20, 2019 at 2:30 AM Alistair Francis  wrote:
>
> On Wed, Jun 19, 2019 at 8:18 AM Bin Meng  wrote:
> >
> > On Wed, Jun 19, 2019 at 1:14 PM Anup Patel  wrote:
> > >
> > > On Wed, Jun 19, 2019 at 6:24 AM Alistair Francis
> > >  wrote:
> > > >
> > > > Add OpenSBI version 0.3 as a git submodule and as a prebuilt binary.
> > > >
> > > > Signed-off-by: Alistair Francis 
> > > > ---
> > > >  .gitmodules |   3 +++
> > > >  Makefile|   3 ++-
> > > >  configure   |   1 +
> > > >  pc-bios/opensbi-riscv32-fw_jump.elf | Bin 0 -> 197988 bytes
> > > >  pc-bios/opensbi-riscv64-fw_jump.elf | Bin 0 -> 200192 bytes
> > > >  roms/Makefile   |  17 +
> > > >  roms/opensbi|   1 +
> > > >  7 files changed, 24 insertions(+), 1 deletion(-)
> > > >  create mode 100644 pc-bios/opensbi-riscv32-fw_jump.elf
> > > >  create mode 100644 pc-bios/opensbi-riscv64-fw_jump.elf
> > > >  create mode 16 roms/opensbi
> > > >
> > >
> > > The OpenSBI firmwares are platform specific so we should have
> > > machine directory under pc-bios/ directory
> > >
> > > So for virt machine we will have:
> > > pc-bios/riscv32/virt/fw_jump.elf
> > > pc-bios/riscv64/virt/fw_jump.elf
>
> I have updated the names to indicate the machine. The pc-bios directory
> appears to be flat (at least for binaries) so I don't want to add
> subdirectories.
>

Should we include pre-built OpenSBI "bios" for "sifive_u" machine too?

> >
> > And we should only integrate plain binary image for "bios" images here.
> >
> > pc-bios/riscv32/virt/fw_jump.bin
> > pc-bios/riscv64/virt/fw_jump.bin
>
> Yep, fixed.
>

Regards,
Bin



Re: [Qemu-devel] [PATCH] riscv: sifive_test: Add reset functionality

2019-06-20 Thread Bin Meng
Hi Palmer,

On Fri, Jun 21, 2019 at 10:53 AM Palmer Dabbelt  wrote:
>
> On Wed, 19 Jun 2019 06:42:21 PDT (-0700), bmeng...@gmail.com wrote:
> > Hi Alistair,
> >
> > On Tue, Jun 18, 2019 at 1:15 AM Alistair Francis  
> > wrote:
> >>
> >> On Fri, Jun 14, 2019 at 8:30 AM Bin Meng  wrote:
> >> >
> >> > This adds a reset opcode for the sifive_test device to trigger a system
> >> > reset for testing purposes.
> >> >
> >> > Signed-off-by: Bin Meng 
> >> > ---
> >> >
> >> >  hw/riscv/sifive_test.c | 4 
> >> >  include/hw/riscv/sifive_test.h | 3 ++-
> >> >  2 files changed, 6 insertions(+), 1 deletion(-)
> >> >
> >> > diff --git a/hw/riscv/sifive_test.c b/hw/riscv/sifive_test.c
> >> > index 24a04d7..cd86831 100644
> >> > --- a/hw/riscv/sifive_test.c
> >> > +++ b/hw/riscv/sifive_test.c
> >> > @@ -21,6 +21,7 @@
> >> >  #include "qemu/osdep.h"
> >> >  #include "hw/sysbus.h"
> >> >  #include "qemu/module.h"
> >> > +#include "sysemu/sysemu.h"
> >> >  #include "target/riscv/cpu.h"
> >> >  #include "hw/riscv/sifive_test.h"
> >> >
> >> > @@ -40,6 +41,9 @@ static void sifive_test_write(void *opaque, hwaddr 
> >> > addr,
> >> >  exit(code);
> >> >  case FINISHER_PASS:
> >> >  exit(0);
> >> > +case FINISHER_RESET:
> >> > +qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
> >> > +return;
> >> >  default:
> >> >  break;
> >> >  }
> >> > diff --git a/include/hw/riscv/sifive_test.h 
> >> > b/include/hw/riscv/sifive_test.h
> >> > index 71d4c9f..c186a31 100644
> >> > --- a/include/hw/riscv/sifive_test.h
> >> > +++ b/include/hw/riscv/sifive_test.h
> >> > @@ -34,7 +34,8 @@ typedef struct SiFiveTestState {
> >> >
> >> >  enum {
> >> >  FINISHER_FAIL = 0x3333,
> >> > -FINISHER_PASS = 0x5555
> >> > +FINISHER_PASS = 0x5555,
> >> > +FINISHER_RESET = 0x7777
> >>
> >> Do you mind sharing where you got this value from? I can't find
> >> details on this device in the SiFive manuals.
> >>
> >
> > I don't think this is a device that actually exists on SiFive's
> > chipset. It's hypothetical.
>
> The device actually does exist in the hardware, but that's just an
> implementation quirk.  Essentially what's going on here is that the RTL
> contains this device, which has a register and then a behavioral verilog block
> that causes simulations to terminate.  This is how we exit from tests in RTL
> simulation, and we've just gone ahead and implemented the same device in QEMU
> in order to make it easy to have compatibility with those bare-metal tests.
> Due to how our design flow is set up we end up with exactly the same block in
> the ASIC.  The register is still there, but the behavioral code to exit
> simulations doesn't do anything so it's essentially just a useless device.
> Since it's useless we don't bother writing it up in the ASIC documentation, 
> but
> it should be in the RTL documentation.
>
> I'm not opposed to extending the interface in the suggested fashion, but I
> wanted to check with the hardware team first to see if they're doing anything
> with the other numbers.  I'm out of the office (and somewhat backed up on code
> review) until July, so it might take a bit to dig through this.

Thanks for the clarification. The main reason for adding this
functionality is to provide software with a way of rebooting the whole
system. Please provide an update after you consult the SiFive hardware guys
:)
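For reference, bare-metal guest code could trigger the proposed reset roughly
as below. The MMIO base address used here (0x100000, where QEMU's "virt"
machine maps the test device) and the FINISHER_RESET value are assumptions for
illustration, not something taken from SiFive documentation:

    #include <stdint.h>

    /* Assumed address and value -- see the note above. */
    #define SIFIVE_TEST_BASE   0x100000UL
    #define FINISHER_RESET     0x7777U

    static inline void system_reset(void)
    {
        /* A single 32-bit write to the test device requests a system reset. */
        *(volatile uint32_t *)SIFIVE_TEST_BASE = FINISHER_RESET;
    }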

Regards,
Bin



Re: [Qemu-devel] [PATCH] riscv: sifive_test: Add reset functionality

2019-06-20 Thread Palmer Dabbelt

On Wed, 19 Jun 2019 06:42:21 PDT (-0700), bmeng...@gmail.com wrote:

Hi Alistair,

On Tue, Jun 18, 2019 at 1:15 AM Alistair Francis  wrote:


On Fri, Jun 14, 2019 at 8:30 AM Bin Meng  wrote:
>
> This adds a reset opcode for the sifive_test device to trigger a system
> reset for testing purposes.
>
> Signed-off-by: Bin Meng 
> ---
>
>  hw/riscv/sifive_test.c | 4 
>  include/hw/riscv/sifive_test.h | 3 ++-
>  2 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/hw/riscv/sifive_test.c b/hw/riscv/sifive_test.c
> index 24a04d7..cd86831 100644
> --- a/hw/riscv/sifive_test.c
> +++ b/hw/riscv/sifive_test.c
> @@ -21,6 +21,7 @@
>  #include "qemu/osdep.h"
>  #include "hw/sysbus.h"
>  #include "qemu/module.h"
> +#include "sysemu/sysemu.h"
>  #include "target/riscv/cpu.h"
>  #include "hw/riscv/sifive_test.h"
>
> @@ -40,6 +41,9 @@ static void sifive_test_write(void *opaque, hwaddr addr,
>  exit(code);
>  case FINISHER_PASS:
>  exit(0);
> +case FINISHER_RESET:
> +qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
> +return;
>  default:
>  break;
>  }
> diff --git a/include/hw/riscv/sifive_test.h b/include/hw/riscv/sifive_test.h
> index 71d4c9f..c186a31 100644
> --- a/include/hw/riscv/sifive_test.h
> +++ b/include/hw/riscv/sifive_test.h
> @@ -34,7 +34,8 @@ typedef struct SiFiveTestState {
>
>  enum {
>  FINISHER_FAIL = 0x3333,
> -FINISHER_PASS = 0x5555
> +FINISHER_PASS = 0x5555,
> +FINISHER_RESET = 0x7777

Do you mind sharing where you got this value from? I can't find
details on this device in the SiFive manuals.



I don't think this is a device that actually exists on SiFive's
chipset. It's hypothetical.


The device actually does exist in the hardware, but that's just an
implementation quirk.  Essentially what's going on here is that the RTL
contains this device, which has a register and then a behavioral verilog block
that causes simulations to terminate.  This is how we exit from tests in RTL
simulation, and we've just gone ahead and implemented the same device in QEMU
in order to make it easy to have compatibility with those bare-metal tests.
Due to how our design flow is set up we end up with exactly the same block in
the ASIC.  The register is still there, but the behavioral code to exit
simulations doesn't do anything so it's essentially just a useless device.
Since it's useless we don't bother writing it up in the ASIC documentation, but
it should be in the RTL documentation.

I'm not opposed to extending the interface in the suggested fashion, but I
wanted to check with the hardware team first to see if they're doing anything
with the other numbers.  I'm out of the office (and somewhat backed up on code
review) until July, so it might take a bit to dig through this.



Re: [Qemu-devel] [PATCH v1 0/9] Update the RISC-V specification versions

2019-06-20 Thread Palmer Dabbelt

On Wed, 19 Jun 2019 07:19:38 PDT (-0700), alistai...@gmail.com wrote:

On Wed, Jun 19, 2019 at 3:58 AM Palmer Dabbelt  wrote:


On Mon, 17 Jun 2019 18:31:00 PDT (-0700), Alistair Francis wrote:
> Based-on: 
>
> Now that the RISC-V spec has started to be ratified let's update our
> QEMU implementation. There are a few things going on here:
>  - Add priv version 1.11.0 to QEMU
> - This is the ratified version of the Privileged spec
> - There are almost no changes to 1.10
>  - Mark the 1.09.1 privileged spec as deprecated
>  - Let's aim to remove it in two releases
>  - Set priv version 1.11.0 as the default
>  - Remove the user_spec version
>  - This doesn't really mean anything so let's remove it
>  - Add support for the "Counters" extension
>  - Add command line options for Zifencei and Zicsr

Thanks!  I'll look at the code, but I've currently got this queued up behind
your hypervisor patches so it might take a bit.  LMK if you want me to invert
the priority on these.  I'll probably be buried until the start of July.


Let's move the Hypervisor patches to the back then. There is a new
spec version now anyway so I'll have to update them for that.


OK.  Do you want me to just drop them and wait for a v2 / draft 0.4?



Alistair



> We can remove the spec version as it's unused and has never been exposed
> to users. The idea is to match the specs in specifying the version. To
> handle versions in the future we can extend the extension props to
> handle version information.
>
> For example something like this: -cpu rv64,i=2.2,c=2.0,h=0.4,priv_spec=1.11
>
> NOTE: This isn't supported today as we only have one of each version.
>
> This will be a future change if we decide to support multiple versions
> of extensions.
>
> The "priv_spec" string doesn't really match, but I don't have a better
> way to say "Machine ISA" and "Supervisor ISA" which is what is included
> in "priv_spec".
>
> For completeness I have also added the Counters, Zifencei and Zicsr
> extensions.
>
> Everything else seems to match the spec names/style.
>
> Please let me know if I'm missing something. QEMU 4.1 is the first
> release to support the extensions from the command line, so we can
> easily change it until then. After that it'll take more work to change
> the command line interface.
>
> Alistair Francis (9):
>   target/riscv: Restructure deprecated CPUs
>   target/riscv: Add the privileged spec version 1.11.0
>   target/riscv: Comment in the mcountinhibit CSR
>   target/riscv: Set privileged spec 1.11.0 as default
>   qemu-deprecated.texi: Deprecate the RISC-V privileged spec 1.09.1
>   target/riscv: Require either I or E base extension
>   target/riscv: Remove user version information
>   target/riscv: Add support for disabling/enabling Counters
>   target/riscv: Add Zifencei and Zicsr as command line options
>
>  qemu-deprecated.texi  |  8 +++
>  target/riscv/cpu.c| 72 ++-
>  target/riscv/cpu.h| 19 ++---
>  target/riscv/cpu_bits.h   |  1 +
>  target/riscv/csr.c| 13 +++-
>  .../riscv/insn_trans/trans_privileged.inc.c   |  2 +-
>  6 files changed, 71 insertions(+), 44 deletions(-)




Re: [Qemu-devel] [PATCH] memory: do not do out of bound notification

2019-06-20 Thread Peter Xu
On Thu, Jun 20, 2019 at 03:14:04PM +0200, Paolo Bonzini wrote:
> On 20/06/19 14:59, Peter Xu wrote:
> > I feel like this can be problematic.  I'm imaging:
> > 
> > start=0x1000_, size=0x1000_1000
> > 
> > This will get size=0x1000 but actually we can do size=0x1000_ as
> > the first.
> 
> Right, we can do:
> 
> /*
>  * If a naturally aligned region starting at "start" ends before "end",
>  * use it.  Otherwise, keep the lowest bit of size.
>  */
> if (size > (start & -start))
>     size = start & -start;

May need to consider start==0, otherwise size will be zero here?

> else
>     size = size & -size;

Should use MSB rather than LSB of size?

> 
> >>
> >> +trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
> >> + VTD_PCI_SLOT(as->devfn),
> >> + VTD_PCI_FUNC(as->devfn),
> >> + entry.iova, size);
> > 
> > Can move this out because this is a trace only so we don't have
> > restriction on mask?
> > 
> >>
> >> -map.iova = entry.iova;
> >> -map.size = entry.addr_mask;
> >> -iova_tree_remove(as->iova_tree, &map);
> >> +map.iova = entry.iova;
> >> +map.size = entry.addr_mask;
> >> +iova_tree_remove(as->iova_tree, &map);
> > 
> > Same here?
> > 
> 
> Yes, I would move these and the iova_tree_remove outside the loop, while
> keeping entry's initialization inside looks cleaner.

Yeah that's ok to me too.
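To illustrate the chunking being discussed, here is a minimal standalone sketch
(not the actual intel_iommu patch) that emits the largest naturally aligned,
power-of-two-sized chunks, handling start == 0 and taking the most significant
bit of size rather than the lowest one:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Round size down to its most significant power-of-two bit. */
    static uint64_t pow2floor_u64(uint64_t v)
    {
        return UINT64_C(0x8000000000000000) >> __builtin_clzll(v);
    }

    static void notify_unmap_chunks(uint64_t start, uint64_t size)
    {
        while (size) {
            /* Largest alignment "start" satisfies; start == 0 imposes no limit. */
            uint64_t align = start ? (start & -start) : UINT64_MAX;
            uint64_t chunk = pow2floor_u64(size);

            if (chunk > align) {
                chunk = align;
            }
            printf("notify start=0x%" PRIx64 " size=0x%" PRIx64 "\n", start, chunk);
            start += chunk;
            size -= chunk;
        }
    }

    int main(void)
    {
        /* An example similar to the one discussed above. */
        notify_unmap_chunks(0x10000000, 0x10001000);
        return 0;
    }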

Thanks,

-- 
Peter Xu



Re: [Qemu-devel] [PULL 00/25] Misc (mostly x86) patches for 2019-06-21

2019-06-20 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/1561081350-3723-1-git-send-email-pbonz...@redhat.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Subject: [Qemu-devel] [PULL 00/25] Misc (mostly x86) patches for 2019-06-21
Type: series
Message-id: 1561081350-3723-1-git-send-email-pbonz...@redhat.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

From https://github.com/patchew-project/qemu
 * [new tag]   
patchew/1561081350-3723-1-git-send-email-pbonz...@redhat.com -> 
patchew/1561081350-3723-1-git-send-email-pbonz...@redhat.com
Switched to a new branch 'test'
40d8d2a6bd hw: Nuke hw_compat_4_0_1 and pc_compat_4_0_1
73fe0889f4 util/main-loop: Fix incorrect assertion
618fd44d79 sd: Fix out-of-bounds assertions
d3c8dae021 target/i386: kvm: Add nested migration blocker only when kernel 
lacks required capabilities
31293fcc83 target/i386: kvm: Add support for KVM_CAP_EXCEPTION_PAYLOAD
c86b7f9ea7 target/i386: kvm: Add support for save and restore nested state
74af141e76 vmstate: Add support for kernel integer types
7fb006f6dc linux-headers: sync with latest KVM headers from Linux 5.2
b0e72dd5be target/i386: kvm: Block migration for vCPUs exposed with nested 
virtualization
6eb6ee733d target/i386: kvm: Re-inject #DB to guest with updated DR6
063779b466 target/i386: kvm: Use symbolic constant for #DB/#BP exception 
constants
0d5f1db3f3 KVM: Introduce kvm_arch_destroy_vcpu()
8410943162 target/i386: kvm: Delete VMX migration blocker on vCPU init failure
7d269c7b25 target/i386: define a new MSR based feature word - 
FEAT_CORE_CAPABILITY
48a660ddf1 i386/kvm: add support for Direct Mode for Hyper-V synthetic timers
4c286bb0f0 i386/kvm: hv-evmcs requires hv-vapic
b0a84ac0f3 i386/kvm: hv-tlbflush/ipi require hv-vpindex
67ae163697 i386/kvm: hv-stimer requires hv-time and hv-synic
c4235a20e3 i386/kvm: implement 'hv-passthrough' mode
9b41ec1de0 i386/kvm: document existing Hyper-V enlightenments
644d910def i386/kvm: move Hyper-V CPUID filling to hyperv_handle_properties()
cf3f383dc6 i386/kvm: add support for KVM_GET_SUPPORTED_HV_CPUID
03b1cb7339 i386/kvm: convert hyperv enlightenments properties from bools to bits
31a2b079aa hax: Honor CPUState::halted
4ff0d94d19 kvm-all: Add/update fprintf's for kvm_*_ioeventfd_del

=== OUTPUT BEGIN ===
1/25 Checking commit 4ff0d94d1917 (kvm-all: Add/update fprintf's for 
kvm_*_ioeventfd_del)
2/25 Checking commit 31a2b079aaf9 (hax: Honor CPUState::halted)
WARNING: Block comments use a leading /* on a separate line
#77: FILE: target/i386/hax-all.c:479:
+/* After a vcpu is halted (either because it is an AP and has just been

WARNING: Block comments use a leading /* on a separate line
#109: FILE: target/i386/hax-all.c:519:
+/* If this vcpu is halted, we must not ask HAXM to run it. Instead, we

total: 0 errors, 2 warnings, 60 lines checked

Patch 2/25 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
3/25 Checking commit 03b1cb733929 (i386/kvm: convert hyperv enlightenments 
properties from bools to bits)
4/25 Checking commit cf3f383dc652 (i386/kvm: add support for 
KVM_GET_SUPPORTED_HV_CPUID)
5/25 Checking commit 644d910def82 (i386/kvm: move Hyper-V CPUID filling to 
hyperv_handle_properties())
6/25 Checking commit 9b41ec1de0ee (i386/kvm: document existing Hyper-V 
enlightenments)
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#18: 
new file mode 100644

total: 0 errors, 1 warnings, 181 lines checked

Patch 6/25 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
7/25 Checking commit c4235a20e312 (i386/kvm: implement 'hv-passthrough' mode)
8/25 Checking commit 67ae163697ff (i386/kvm: hv-stimer requires hv-time and 
hv-synic)
9/25 Checking commit b0a84ac0f3fa (i386/kvm: hv-tlbflush/ipi require hv-vpindex)
10/25 Checking commit 4c286bb0f0d4 (i386/kvm: hv-evmcs requires hv-vapic)
11/25 Checking commit 48a660ddf1ab (i386/kvm: add support for Direct Mode for 
Hyper-V synthetic timers)
12/25 Checking commit 7d269c7b257f (target/i386: define a new MSR based feature 
word - FEAT_CORE_CAPABILITY)
13/25 Checking commit 841094316237 (target/i386: kvm: Delete VMX migration 
blocker on vCPU init failure)
14/25 Checking commit 0d5f1db3f3b8 (KVM: Introduce kvm_arch_destroy_vcpu())
ERROR: code indent should never use tabs
#61: FILE: target/arm/kvm32.c:245:
+^Ireturn 0;$

ERROR: g_free(NULL) is safe this check is probably not required
#96: FILE: target/i386/kvm.c:1687:
+if (cpu->kvm_msr_buf) {
+g_free(cpu->kvm_msr_buf);

total: 2 errors, 0 warnings, 96 lines checked

Patch 14/25 has style problems, please 

Re: [Qemu-devel] [External Email] Re: [PATCH 3/3] qapi: add block size histogram interface

2019-06-20 Thread zhenwei pi

On 6/20/19 10:03 PM, Eric Blake wrote:


On 6/20/19 3:54 AM, zhenwei pi wrote:

Set/Clear block size histograms through new command
x-block-size-histogram-set and show new statistics in
query-blockstats results.


I'm guessing this is modeled after the existing
block-latency-histogram-set command?


zhenwei: Yes, it is.


Signed-off-by: zhenwei pi 
---
  block/qapi.c |  24 
  blockdev.c   |  56 +++
  qapi/block-core.json | 105 ++-
  3 files changed, 184 insertions(+), 1 deletion(-)
+++ b/qapi/block-core.json
@@ -633,6 +633,100 @@
 '*boundaries-flush': ['uint64'] } }
  
  ##

+# @BlockSizeHistogramInfo:
+#
+# Block size histogram.
+#
+# @boundaries: list of interval boundary values in bytes, all greater
+#  than zero and in ascending order.
+#  For example, the list [8193, 32769, 131073] produces the
+#  following histogram intervals:
+#  [0, 8193), [8193, 32769), [32769, 131073), [131073, +inf).
+#
+# @bins: list of io request counts corresponding to histogram intervals.
+#len(@bins) = len(@boundaries) + 1
+#For the example above, @bins may be something like [6, 3, 7, 9].
+#
+# Since: 4.0

You've missed 4.0; the next release is 4.1.


zhenwei: OK, I will fix all the version info.


+##
+{ 'struct': 'BlockSizeHistogramInfo',
+  'data': {'boundaries': ['uint64'], 'bins': ['uint64'] } }

This is identical to struct BlockLatencyHistogramInfo; can we instead
rename the type (which does not affect API) and share it between both
implementations, instead of duplicating it?


zhenwei: Good idea, but I am confused about the compatibility of the
BlockLatencyHistogramInfo structure. If I rename BlockLatencyHistogramInfo
to BlockHistogramInfo, it can be shared by both.


+
+##
+# @x-block-size-histogram-set:

Does this need to be experimental from the get-go? Or can it be stable
by dropping 'x-' since it matches the fact that
block-latency-histogram-set is stable?


zhenwei: OK, I will drop 'x-' prefix.


+#
+# Manage read, write and flush size histograms for the device.
+#
+# If only @id parameter is specified, remove all present size histograms
+# for the device. Otherwise, add/reset some of (or all) size histograms.
+#
+# @id: The name or QOM path of the guest device.
+#
+# @boundaries: list of interval boundary values (see description in
+#  BlockSizeHistogramInfo definition). If specified, all
+#  size histograms are removed, and empty ones created for all
+#  io types with intervals corresponding to @boundaries (except for
+#  io types, for which specific boundaries are set through the
+#  following parameters).
+#
+# @boundaries-read: list of interval boundary values for read size
+#   histogram. If specified, old read size histogram is
+#   removed, and empty one created with intervals
+#   corresponding to @boundaries-read. The parameter has higher
+#   priority than @boundaries.
+#
+# @boundaries-write: list of interval boundary values for write size
+#histogram.
+#
+# @boundaries-flush: list of interval boundary values for flush size
+#histogram.
+#
+# Returns: error if device is not found or any boundary arrays are invalid.
+#
+# Since: 4.0

4.1


+#
+# Example: set new histograms for all io types with intervals
+# [0, 8193), [8193, 32769), [32769, 131073), [131073, +inf):
+#
+# -> { "execute": "x-block-size-histogram-set",
+#  "arguments": { "id": "drive0",
+# "boundaries": [8193, 32769, 131073] } }
+# <- { "return": {} }
+#
+# Example: set new histogram only for write, other histograms will remain
+# not changed (or not created):
+#
+# -> { "execute": "x-block-size-histogram-set",
+#  "arguments": { "id": "drive0",
+# "boundaries-write": [8193, 32769, 131073] } }
+# <- { "return": {} }
+#
+# Example: set new histograms with the following intervals:
+#   read, flush: [0, 8193), [8193, 32769), [32769, 131073), [131073, +inf)
+#   write: [0, 4097), [4097, 8193), [8193, 32769), [32769, +inf)
+#
+# -> { "execute": "x-block-size-histogram-set",
+#  "arguments": { "id": "drive0",
+# "boundaries": [8193, 32769, 131073],
+# "boundaries-write": [4097, 8193, 32769] } }
+# <- { "return": {} }
+#
+# Example: remove all size histograms:
+#
+# -> { "execute": "x-block-size-histogram-set",
+#  "arguments": { "id": "drive0" } }
+# <- { "return": {} }
+##
+{ 'command': 'x-block-size-histogram-set',
+  'data': {'id': 'str',
+   '*boundaries': ['uint64'],
+   '*boundaries-read': ['uint64'],
+   '*boundaries-write': ['uint64'],
+   '*boundaries-flush': ['uint64'] } }

Again, this copies heavily from 

Re: [Qemu-devel] [PATCH] pc: fix possible NULL pointer dereference in pc_machine_get_device_memory_region_size()

2019-06-20 Thread Eduardo Habkost
On Fri, Jun 21, 2019 at 02:29:29AM +0200, Paolo Bonzini wrote:
> On 10/06/19 15:50, Igor Mammedov wrote:
> > QEMU will crash when the device-memory-region-size property is read if
> > ms->device_memory hasn't been initialized yet.
> > 
> > Crash can be reproduced with:
> >  $QEMU -preconfig -qmp unix:qmp_socket,server,nowait &
> >  ./scripts/qmp/qom-get -s qmp_socket /machine.device-memory-region-size
> > 
> > Instead of crashing return 0 if ms->device_memory hasn't been initialized.
> 
> This patch breaks bios-tables-test /x86_64/acpi/piix64/cpuhp:
> 
> acpi-test: Warning! SRAT binary file mismatch. Actual [aml:/tmp/aml-RIFK3Z], 
> Expected [aml:tests/data/acpi/pc/SRAT.memhp].
> acpi-test: Warning! SRAT mismatch. Actual [asl:/tmp/asl-TLFK3Z.dsl, 
> aml:/tmp/aml-RIFK3Z], Expected [asl:/tmp/asl-JL5J3Z.dsl, 
> aml:tests/data/acpi/pc/SRAT.memhp].
> **
> ERROR:/home/pbonzini/work/upstream/qemu/tests/bios-tables-test.c:434:test_acpi_asl:
>  assertion failed: (all_tables_match)
> ERROR - Bail out! 
> ERROR:/home/pbonzini/work/upstream/qemu/tests/bios-tables-test.c:434:test_acpi_asl:
>  assertion failed: (all_tables_match)
> 
> So I'm removing it from the pull request.

The patch makes the property return 0 for all machines, because the
memory_region_size() result is never assigned to value.


> > Signed-off-by: Igor Mammedov 
> > ---
> > v2:
> >   add reproducer to commit message
> >(Markus Armbruster )
> > 
> >  hw/i386/pc.c | 6 +-
> >  1 file changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index edc240b..1b7ead9 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -2459,7 +2459,11 @@ pc_machine_get_device_memory_region_size(Object 
> > *obj, Visitor *v,
> >   Error **errp)
> >  {
> >  MachineState *ms = MACHINE(obj);
> > -int64_t value = memory_region_size(&ms->device_memory->mr);
> > +int64_t value = 0;
> > +
> > +if (ms->device_memory) {
> > +memory_region_size(&ms->device_memory->mr);

This was supposed to be:

value = memory_region_size(&ms->device_memory->mr);

> > +}
> >  
> >  visit_type_int(v, name, &value, errp);
> >  }
> > 
> 
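For reference, a sketch of the getter with the missing assignment applied, as
pointed out above; this mirrors the quoted patch and is not necessarily the
final committed version:

    static void
    pc_machine_get_device_memory_region_size(Object *obj, Visitor *v,
                                             const char *name, void *opaque,
                                             Error **errp)
    {
        MachineState *ms = MACHINE(obj);
        int64_t value = 0;

        if (ms->device_memory) {
            value = memory_region_size(&ms->device_memory->mr);
        }

        visit_type_int(v, name, &value, errp);
    }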

-- 
Eduardo



[Qemu-devel] [PULL 21/25] target/i386: kvm: Add support for KVM_CAP_EXCEPTION_PAYLOAD

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Kernel commit c4f55198c7c2 ("kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD")
introduced a new KVM capability which allows userspace to correctly
distinguish between pending and injected exceptions.

This distinction is important in nested virtualization scenarios
because an L2 pending exception can still be intercepted by the L1 hypervisor
while an L2 injected exception cannot.

Furthermore, when QEMU attempts to inject an exception, it should
specify the exception payload (CR2 in case of #PF or DR6 in case of #DB)
instead of having the payload already delivered in the respective vCPU
register. This is because, if the exception is injected into an L2 guest
and intercepted by the L1 hypervisor, the payload needs to be reported
to the L1 intercept (VM-exit handler) while keeping the respective vCPU
register unchanged.

This commit adds support for QEMU to properly utilise this new KVM
capability (KVM_CAP_EXCEPTION_PAYLOAD).

Reviewed-by: Nikita Leshenko 
Signed-off-by: Liran Alon 
Message-Id: <20190619162140.133674-10-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.c|   6 ++-
 target/i386/cpu.h|   6 ++-
 target/i386/hvf/hvf.c|  10 +++--
 target/i386/hvf/x86hvf.c |   4 +-
 target/i386/kvm.c| 101 +++
 target/i386/machine.c|  84 ++-
 6 files changed, 187 insertions(+), 24 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 61e44cb..da6eb67 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4819,7 +4819,11 @@ static void x86_cpu_reset(CPUState *s)
 memset(env->mtrr_fixed, 0, sizeof(env->mtrr_fixed));
 
 env->interrupt_injected = -1;
-env->exception_injected = -1;
+env->exception_nr = -1;
+env->exception_pending = 0;
+env->exception_injected = 0;
+env->exception_has_payload = false;
+env->exception_payload = 0;
 env->nmi_injected = false;
 #if !defined(CONFIG_USER_ONLY)
 /* We hard-wire the BSP to the first CPU. */
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 16d898c..7e003b8 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1348,10 +1348,14 @@ typedef struct CPUX86State {
 
 /* For KVM */
 uint32_t mp_state;
-int32_t exception_injected;
+int32_t exception_nr;
 int32_t interrupt_injected;
 uint8_t soft_interrupt;
+uint8_t exception_pending;
+uint8_t exception_injected;
 uint8_t has_error_code;
+uint8_t exception_has_payload;
+uint64_t exception_payload;
 uint32_t ins_len;
 uint32_t sipi_vector;
 bool tsc_valid;
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index 2751c81..dc4bb63 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -605,7 +605,9 @@ static void hvf_store_events(CPUState *cpu, uint32_t 
ins_len, uint64_t idtvec_in
 X86CPU *x86_cpu = X86_CPU(cpu);
 CPUX86State *env = &x86_cpu->env;
 
-env->exception_injected = -1;
+env->exception_nr = -1;
+env->exception_pending = 0;
+env->exception_injected = 0;
 env->interrupt_injected = -1;
 env->nmi_injected = false;
 if (idtvec_info & VMCS_IDT_VEC_VALID) {
@@ -619,7 +621,8 @@ static void hvf_store_events(CPUState *cpu, uint32_t 
ins_len, uint64_t idtvec_in
 break;
 case VMCS_IDT_VEC_HWEXCEPTION:
 case VMCS_IDT_VEC_SWEXCEPTION:
-env->exception_injected = idtvec_info & VMCS_IDT_VEC_VECNUM;
+env->exception_nr = idtvec_info & VMCS_IDT_VEC_VECNUM;
+env->exception_injected = 1;
 break;
 case VMCS_IDT_VEC_PRIV_SWEXCEPTION:
 default:
@@ -912,7 +915,8 @@ int hvf_vcpu_exec(CPUState *cpu)
 macvm_set_rip(cpu, rip + ins_len);
 break;
 case VMX_REASON_VMCALL:
-env->exception_injected = EXCP0D_GPF;
+env->exception_nr = EXCP0D_GPF;
+env->exception_injected = 1;
 env->has_error_code = true;
 env->error_code = 0;
 break;
diff --git a/target/i386/hvf/x86hvf.c b/target/i386/hvf/x86hvf.c
index df8e946..e0ea02d 100644
--- a/target/i386/hvf/x86hvf.c
+++ b/target/i386/hvf/x86hvf.c
@@ -362,8 +362,8 @@ bool hvf_inject_interrupts(CPUState *cpu_state)
 if (env->interrupt_injected != -1) {
 vector = env->interrupt_injected;
 intr_type = VMCS_INTR_T_SWINTR;
-} else if (env->exception_injected != -1) {
-vector = env->exception_injected;
+} else if (env->exception_nr != -1) {
+vector = env->exception_nr;
 if (vector == EXCP03_INT3 || vector == EXCP04_INTO) {
 intr_type = VMCS_INTR_T_SWEXCEPTION;
 } else {
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index e924663..ab812b5 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -104,6 +104,7 @@ static uint32_t num_architectural_pmu_fixed_counters;
 static int has_xsave;
 static int has_xcrs;
 static int 

[Qemu-devel] [PULL 20/25] target/i386: kvm: Add support for save and restore nested state

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Kernel commit 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE")
introduced new IOCTLs to extract and restore vCPU state related to
Intel VMX & AMD SVM.

Utilize these IOCTLs to add support for migration of VMs which are
running nested hypervisors.

Reviewed-by: Nikita Leshenko 
Reviewed-by: Maran Wilson 
Tested-by: Maran Wilson 
Signed-off-by: Liran Alon 
Message-Id: <20190619162140.133674-9-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 accel/kvm/kvm-all.c   |   8 ++
 include/sysemu/kvm.h  |   1 +
 target/i386/cpu.h |   3 +
 target/i386/kvm.c |  80 
 target/i386/machine.c | 198 ++
 5 files changed, 290 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f0f5ab8..e3cf728 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -87,6 +87,7 @@ struct KVMState
 #ifdef KVM_CAP_SET_GUEST_DEBUG
 QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
 #endif
+int max_nested_state_len;
 int many_ioeventfds;
 int intx_set_mask;
 bool sync_mmu;
@@ -1681,6 +1682,8 @@ static int kvm_init(MachineState *ms)
 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
 #endif
 
+s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
+
 #ifdef KVM_CAP_IRQ_ROUTING
 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
 #endif
@@ -2248,6 +2251,11 @@ int kvm_has_debugregs(void)
 return kvm_state->debugregs;
 }
 
+int kvm_max_nested_state_length(void)
+{
+return kvm_state->max_nested_state_len;
+}
+
 int kvm_has_many_ioeventfds(void)
 {
 if (!kvm_enabled()) {
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 64f55e5..acd90ae 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -210,6 +210,7 @@ bool kvm_has_sync_mmu(void);
 int kvm_has_vcpu_events(void);
 int kvm_has_robust_singlestep(void);
 int kvm_has_debugregs(void);
+int kvm_max_nested_state_length(void);
 int kvm_has_pit_state2(void);
 int kvm_has_many_ioeventfds(void);
 int kvm_has_gsi_routing(void);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 4ae4145..16d898c 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1360,6 +1360,9 @@ typedef struct CPUX86State {
 #if defined(CONFIG_KVM) || defined(CONFIG_HVF)
 void *xsave_buf;
 #endif
+#if defined(CONFIG_KVM)
+struct kvm_nested_state *nested_state;
+#endif
 #if defined(CONFIG_HVF)
 HVFX86EmulatorState *hvf_emul;
 #endif
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index f9872f1..e924663 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1324,6 +1324,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 struct kvm_cpuid_entry2 *c;
 uint32_t signature[3];
 int kvm_base = KVM_CPUID_SIGNATURE;
+int max_nested_state_len;
 int r;
 Error *local_err = NULL;
 
@@ -1658,6 +1659,24 @@ int kvm_arch_init_vcpu(CPUState *cs)
 if (has_xsave) {
 env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
 }
+
+max_nested_state_len = kvm_max_nested_state_length();
+if (max_nested_state_len > 0) {
+assert(max_nested_state_len >= offsetof(struct kvm_nested_state, 
data));
+env->nested_state = g_malloc0(max_nested_state_len);
+
+env->nested_state->size = max_nested_state_len;
+
+if (IS_INTEL_CPU(env)) {
+struct kvm_vmx_nested_state_hdr *vmx_hdr =
+&env->nested_state->hdr.vmx;
+
+env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
+vmx_hdr->vmxon_pa = -1ull;
+vmx_hdr->vmcs12_pa = -1ull;
+}
+}
+
 cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
 
 if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
@@ -1682,12 +1701,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
 int kvm_arch_destroy_vcpu(CPUState *cs)
 {
 X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
 
 if (cpu->kvm_msr_buf) {
 g_free(cpu->kvm_msr_buf);
 cpu->kvm_msr_buf = NULL;
 }
 
+if (env->nested_state) {
+g_free(env->nested_state);
+env->nested_state = NULL;
+}
+
 return 0;
 }
 
@@ -3411,6 +3436,52 @@ static int kvm_get_debugregs(X86CPU *cpu)
 return 0;
 }
 
+static int kvm_put_nested_state(X86CPU *cpu)
+{
+CPUX86State *env = &cpu->env;
+int max_nested_state_len = kvm_max_nested_state_length();
+
+if (max_nested_state_len <= 0) {
+return 0;
+}
+
+assert(env->nested_state->size <= max_nested_state_len);
+return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
+}
+
+static int kvm_get_nested_state(X86CPU *cpu)
+{
+CPUX86State *env = &cpu->env;
+int max_nested_state_len = kvm_max_nested_state_length();
+int ret;
+
+if (max_nested_state_len <= 0) {
+return 0;
+}
+
+/*
+ * It is possible that migration restored a smaller size into
+ * nested_state->hdr.size than what our kernel supports.
+ * We 

[Qemu-devel] [PULL 25/25] hw: Nuke hw_compat_4_0_1 and pc_compat_4_0_1

2019-06-20 Thread Paolo Bonzini
From: Greg Kurz 

Commit c87759ce876a fixed a regression affecting pc-q35 machines by
introducing a new pc-q35-4.0.1 machine version to be used instead
of pc-q35-4.0. The only purpose was to revert the default behaviour
of not using split irqchip, but the change also introduced the usual
hw_compat and pc_compat bits, and wired them for pc-q35 only.

This raises questions when it comes to add new compat properties for
4.0* machine versions of any architecture. Where to add them ? In
4.0, 4.0.1 or both ? Error prone. Another possibility would be to teach
all other architectures about 4.0.1. This solution isn't satisfying,
especially since this is a pc-q35 specific issue.

It turns out that the split irqchip default is handled in the machine
option function and doesn't involve compat lists at all.

Drop all the 4.0.1 compat lists and use the 4.0 ones instead in the 4.0.1
machine option function.

Move the compat props that were added to the 4.0.1 since c87759ce876a to
4.0.

Even if only hw_compat_4_0_1 had an impact on other architectures,
drop pc_compat_4_0_1 as well for consistency.

Fixes: c87759ce876a "q35: Revert to kernel irqchip"
Suggested-by: Dr. David Alan Gilbert 
Signed-off-by: Greg Kurz 
Reviewed-by: Dr. David Alan Gilbert 
Reviewed-by: Michael S. Tsirkin 
Message-Id: <156051774276.244890.8660277280145466396.st...@bahia.lan>
Signed-off-by: Paolo Bonzini 
---
 hw/core/machine.c|  5 +
 hw/i386/pc.c |  3 ---
 hw/i386/pc_q35.c | 12 
 include/hw/boards.h  |  3 ---
 include/hw/i386/pc.h |  3 ---
 5 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 84ebb8d..ea5a01a 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -24,16 +24,13 @@
 #include "hw/pci/pci.h"
 #include "hw/mem/nvdimm.h"
 
-GlobalProperty hw_compat_4_0_1[] = {
+GlobalProperty hw_compat_4_0[] = {
 { "VGA","edid", "false" },
 { "secondary-vga",  "edid", "false" },
 { "bochs-display",  "edid", "false" },
 { "virtio-vga", "edid", "false" },
 { "virtio-gpu-pci", "edid", "false" },
 };
-const size_t hw_compat_4_0_1_len = G_N_ELEMENTS(hw_compat_4_0_1);
-
-GlobalProperty hw_compat_4_0[] = {};
 const size_t hw_compat_4_0_len = G_N_ELEMENTS(hw_compat_4_0);
 
 GlobalProperty hw_compat_3_1[] = {
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index e41192b..e96360b 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -111,9 +111,6 @@ struct hpet_fw_config hpet_cfg = {.count = UINT8_MAX};
 /* Physical Address of PVH entry point read from kernel ELF NOTE */
 static size_t pvh_start_addr;
 
-GlobalProperty pc_compat_4_0_1[] = {};
-const size_t pc_compat_4_0_1_len = G_N_ELEMENTS(pc_compat_4_0_1);
-
 GlobalProperty pc_compat_4_0[] = {};
 const size_t pc_compat_4_0_len = G_N_ELEMENTS(pc_compat_4_0);
 
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index dcddc64..57232ae 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -378,8 +378,13 @@ static void pc_q35_4_0_1_machine_options(MachineClass *m)
 {
 pc_q35_4_1_machine_options(m);
 m->alias = NULL;
-compat_props_add(m->compat_props, hw_compat_4_0_1, hw_compat_4_0_1_len);
-compat_props_add(m->compat_props, pc_compat_4_0_1, pc_compat_4_0_1_len);
+/*
+ * This is the default machine for the 4.0-stable branch. It is basically
+ * a 4.0 that doesn't use split irqchip by default. It MUST hence apply the
+ * 4.0 compat props.
+ */
+compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len);
+compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len);
 }
 
 DEFINE_Q35_MACHINE(v4_0_1, "pc-q35-4.0.1", NULL,
@@ -390,8 +395,7 @@ static void pc_q35_4_0_machine_options(MachineClass *m)
 pc_q35_4_0_1_machine_options(m);
 m->default_kernel_irqchip_split = true;
 m->alias = NULL;
-compat_props_add(m->compat_props, hw_compat_4_0, hw_compat_4_0_len);
-compat_props_add(m->compat_props, pc_compat_4_0, pc_compat_4_0_len);
+/* Compat props are applied by the 4.0.1 machine */
 }
 
 DEFINE_Q35_MACHINE(v4_0, "pc-q35-4.0", NULL,
diff --git a/include/hw/boards.h b/include/hw/boards.h
index b7362af..eaa050a 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -293,9 +293,6 @@ struct MachineState {
 } \
 type_init(machine_initfn##_register_types)
 
-extern GlobalProperty hw_compat_4_0_1[];
-extern const size_t hw_compat_4_0_1_len;
-
 extern GlobalProperty hw_compat_4_0[];
 extern const size_t hw_compat_4_0_len;
 
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index a7d0b87..c54cc54 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -293,9 +293,6 @@ int e820_add_entry(uint64_t, uint64_t, uint32_t);
 int e820_get_num_entries(void);
 bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 
-extern GlobalProperty pc_compat_4_0_1[];
-extern const size_t pc_compat_4_0_1_len;
-
 extern GlobalProperty pc_compat_4_0[];
 extern const size_t pc_compat_4_0_len;
 
-- 
1.8.3.1




[Qemu-devel] [PULL 17/25] target/i386: kvm: Block migration for vCPUs exposed with nested virtualization

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Commit d98f26073beb ("target/i386: kvm: add VMX migration blocker")
added a migration blocker for vCPUs exposed with Intel VMX.
However, migration should also be blocked for vCPUs exposed with
AMD SVM.

Both cases should be blocked because QEMU should extract additional
vCPU state from KVM that should be migrated as part of vCPU VMState.
E.g. whether the vCPU is running in guest mode or host mode.

Fixes: d98f26073beb ("target/i386: kvm: add VMX migration blocker")
Reviewed-by: Maran Wilson 
Signed-off-by: Liran Alon 
Message-Id: <20190619162140.133674-6-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.c |  6 --
 target/i386/cpu.h | 22 ++
 target/i386/kvm.c | 14 +++---
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index c330fd9..61e44cb 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5215,12 +5215,6 @@ static int x86_cpu_filter_features(X86CPU *cpu)
 return rv;
 }
 
-#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
-   (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
-   (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
-#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
- (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
- (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
 static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
 {
 CPUState *cs = CPU(dev);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7f48136..4ae4145 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -722,6 +722,13 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 
 #define CPUID_VENDOR_HYGON"HygonGenuine"
 
+#define IS_INTEL_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_INTEL_1 && \
+   (env)->cpuid_vendor2 == CPUID_VENDOR_INTEL_2 && \
+   (env)->cpuid_vendor3 == CPUID_VENDOR_INTEL_3)
+#define IS_AMD_CPU(env) ((env)->cpuid_vendor1 == CPUID_VENDOR_AMD_1 && \
+ (env)->cpuid_vendor2 == CPUID_VENDOR_AMD_2 && \
+ (env)->cpuid_vendor3 == CPUID_VENDOR_AMD_3)
+
 #define CPUID_MWAIT_IBE (1U << 1) /* Interrupts can exit capability */
 #define CPUID_MWAIT_EMX (1U << 0) /* enumeration supported */
 
@@ -1848,6 +1855,21 @@ static inline int32_t x86_get_a20_mask(CPUX86State *env)
 }
 }
 
+static inline bool cpu_has_vmx(CPUX86State *env)
+{
+return (env->features[FEAT_1_ECX] & CPUID_EXT_VMX);
+}
+
+static inline bool cpu_has_svm(CPUX86State *env)
+{
+return (env->features[FEAT_8000_0001_ECX] & CPUID_EXT3_SVM);
+}
+
+static inline bool cpu_has_nested_virt(CPUX86State *env)
+{
+return (cpu_has_vmx(env) || cpu_has_svm(env));
+}
+
 /* fpu_helper.c */
 void update_fp_status(CPUX86State *env);
 void update_mxcsr_status(CPUX86State *env);
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 9864aa0..f9872f1 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1299,7 +1299,7 @@ static int hyperv_init_vcpu(X86CPU *cpu)
 }
 
 static Error *invtsc_mig_blocker;
-static Error *vmx_mig_blocker;
+static Error *nested_virt_mig_blocker;
 
 #define KVM_MAX_CPUID_ENTRIES  100
 
@@ -1597,13 +1597,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
   !!(c->ecx & CPUID_EXT_SMX);
 }
 
-if ((env->features[FEAT_1_ECX] & CPUID_EXT_VMX) && !vmx_mig_blocker) {
-error_setg(&vmx_mig_blocker,
-   "Nested VMX virtualization does not support live migration yet");
-r = migrate_add_blocker(vmx_mig_blocker, &local_err);
+if (cpu_has_nested_virt(env) && !nested_virt_mig_blocker) {
+error_setg(&nested_virt_mig_blocker,
+   "Nested virtualization does not support live migration yet");
+r = migrate_add_blocker(nested_virt_mig_blocker, &local_err);
 if (local_err) {
 error_report_err(local_err);
-error_free(vmx_mig_blocker);
+error_free(nested_virt_mig_blocker);
 return r;
 }
 }
@@ -1674,7 +1674,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
  fail:
 migrate_del_blocker(invtsc_mig_blocker);
  fail2:
-migrate_del_blocker(vmx_mig_blocker);
+migrate_del_blocker(nested_virt_mig_blocker);
 
 return r;
 }
-- 
1.8.3.1





[Qemu-devel] [PULL 12/25] target/i386: define a new MSR based feature word - FEAT_CORE_CAPABILITY

2019-06-20 Thread Paolo Bonzini
From: Xiaoyao Li 

MSR IA32_CORE_CAPABILITY is a feature-enumerating MSR, which currently
only enumerates the split lock detection feature (via bit 5).

The existence of MSR IA32_CORE_CAPABILITY is enumerated by CPUID.7_0:EDX[30].

The latest kernel patches about them can be found here:
https://lkml.org/lkml/2019/4/24/1909

Signed-off-by: Xiaoyao Li 
Message-Id: <20190617153654.916-1-xiaoyao...@linux.intel.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.c | 22 +-
 target/i386/cpu.h |  5 +
 target/i386/kvm.c |  9 +
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 7beb8ab..c330fd9 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1085,7 +1085,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = 
{
 NULL, NULL, NULL, NULL,
 NULL, NULL, NULL, NULL,
 NULL, NULL, "spec-ctrl", "stibp",
-NULL, "arch-capabilities", NULL, "ssbd",
+NULL, "arch-capabilities", "core-capability", "ssbd",
 },
 .cpuid = {
 .eax = 7,
@@ -1203,6 +1203,26 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
 }
 },
 },
+[FEAT_CORE_CAPABILITY] = {
+.type = MSR_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "split-lock-detect", NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.msr = {
+.index = MSR_IA32_CORE_CAPABILITY,
+.cpuid_dep = {
+FEAT_7_0_EDX,
+CPUID_7_0_EDX_CORE_CAPABILITY,
+},
+},
+},
 };
 
 typedef struct X86RegisterInfo32 {
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7470acf..7f48136 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -345,6 +345,7 @@ typedef enum X86Seg {
 #define MSR_IA32_SPEC_CTRL  0x48
 #define MSR_VIRT_SSBD   0xc001011f
 #define MSR_IA32_PRED_CMD   0x49
+#define MSR_IA32_CORE_CAPABILITY0xcf
 #define MSR_IA32_ARCH_CAPABILITIES  0x10a
 #define MSR_IA32_TSCDEADLINE0x6e0
 
@@ -496,6 +497,7 @@ typedef enum FeatureWord {
 FEAT_XSAVE_COMP_LO, /* CPUID[EAX=0xd,ECX=0].EAX */
 FEAT_XSAVE_COMP_HI, /* CPUID[EAX=0xd,ECX=0].EDX */
 FEAT_ARCH_CAPABILITIES,
+FEAT_CORE_CAPABILITY,
 FEATURE_WORDS,
 } FeatureWord;
 
@@ -687,6 +689,7 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) /* AVX512 Multiply Accumulation 
Single Precision */
 #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) /* Speculation Control */
 #define CPUID_7_0_EDX_ARCH_CAPABILITIES (1U << 29)  /*Arch Capabilities*/
+#define CPUID_7_0_EDX_CORE_CAPABILITY   (1U << 30)  /*Core Capability*/
 #define CPUID_7_0_EDX_SPEC_CTRL_SSBD  (1U << 31) /* Speculative Store Bypass 
Disable */
 
 #define CPUID_8000_0008_EBX_WBNOINVD  (1U << 9)  /* Write back and
@@ -734,6 +737,8 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY (1U << 3)
 #define MSR_ARCH_CAP_SSB_NO (1U << 4)
 
+#define MSR_CORE_CAP_SPLIT_LOCK_DETECT  (1U << 5)
+
 /* Supported Hyper-V Enlightenments */
 #define HYPERV_FEAT_RELAXED 0
 #define HYPERV_FEAT_VAPIC   1
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index a323b1f..279f99a 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -95,6 +95,7 @@ static bool has_msr_spec_ctrl;
 static bool has_msr_virt_ssbd;
 static bool has_msr_smi_count;
 static bool has_msr_arch_capabs;
+static bool has_msr_core_capabs;
 
 static uint32_t has_architectural_pmu_version;
 static uint32_t num_architectural_pmu_gp_counters;
@@ -1842,6 +1843,9 @@ static int kvm_get_supported_msrs(KVMState *s)
 case MSR_IA32_ARCH_CAPABILITIES:
 has_msr_arch_capabs = true;
 break;
+case MSR_IA32_CORE_CAPABILITY:
+has_msr_core_capabs = true;
+break;
 }
 }
 }
@@ -2368,6 +2372,11 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
   env->features[FEAT_ARCH_CAPABILITIES]);
 }
 
+if (has_msr_core_capabs) {
+kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
+  env->features[FEAT_CORE_CAPABILITY]);
+}
+
 /*
  * The following MSRs have side effects on the guest or are too heavy
  * for normal writeback. Limit them to reset or full state updates.
-- 
1.8.3.1





[Qemu-devel] [PULL 11/25] i386/kvm: add support for Direct Mode for Hyper-V synthetic timers

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

Hyper-V on KVM can only use Synthetic timers with Direct Mode (opting for
an interrupt instead of VMBus message). This new capability is only
announced in KVM_GET_SUPPORTED_HV_CPUID.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-10-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 docs/hyperv.txt| 10 ++
 target/i386/cpu.c  |  2 ++
 target/i386/cpu.h  |  1 +
 target/i386/hyperv-proto.h |  1 +
 target/i386/kvm.c  |  9 +
 5 files changed, 23 insertions(+)

diff --git a/docs/hyperv.txt b/docs/hyperv.txt
index beadb2d..8fdf25c 100644
--- a/docs/hyperv.txt
+++ b/docs/hyperv.txt
@@ -174,6 +174,16 @@ without the feature to find out if enabling it is 
beneficial.
 
 Requires: hv-vapic
 
+3.17. hv-stimer-direct
+===
+Hyper-V specification allows synthetic timer operation in two modes: "classic",
+when expiration event is delivered as SynIC message and "direct", when the event
+is delivered via normal interrupt. It is known that nested Hyper-V can only
+use synthetic timers in direct mode and thus 'hv-stimer-direct' needs to be
+enabled.
+
+Requires: hv-vpindex, hv-synic, hv-time, hv-stimer
+
 
 4. Development features
 
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index e07996c..7beb8ab 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5883,6 +5883,8 @@ static Property x86_cpu_properties[] = {
   HYPERV_FEAT_EVMCS, 0),
 DEFINE_PROP_BIT64("hv-ipi", X86CPU, hyperv_features,
   HYPERV_FEAT_IPI, 0),
+DEFINE_PROP_BIT64("hv-stimer-direct", X86CPU, hyperv_features,
+  HYPERV_FEAT_STIMER_DIRECT, 0),
 DEFINE_PROP_BOOL("hv-passthrough", X86CPU, hyperv_passthrough, false),
 
 DEFINE_PROP_BOOL("check", X86CPU, check_cpuid, true),
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 86edbf5..7470acf 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -749,6 +749,7 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define HYPERV_FEAT_TLBFLUSH11
 #define HYPERV_FEAT_EVMCS   12
 #define HYPERV_FEAT_IPI 13
+#define HYPERV_FEAT_STIMER_DIRECT   14
 
 #ifndef HYPERV_SPINLOCK_NEVER_RETRY
 #define HYPERV_SPINLOCK_NEVER_RETRY 0x
diff --git a/target/i386/hyperv-proto.h b/target/i386/hyperv-proto.h
index c0272b3..cffac10 100644
--- a/target/i386/hyperv-proto.h
+++ b/target/i386/hyperv-proto.h
@@ -49,6 +49,7 @@
 #define HV_GUEST_IDLE_STATE_AVAILABLE   (1u << 5)
 #define HV_FREQUENCY_MSRS_AVAILABLE (1u << 8)
 #define HV_GUEST_CRASH_MSR_AVAILABLE(1u << 10)
+#define HV_STIMER_DIRECT_MODE_AVAILABLE (1u << 19)
 
 /*
  * HV_CPUID_ENLIGHTMENT_INFO.EAX bits
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 86de510..a323b1f 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -802,6 +802,14 @@ static struct {
 },
 .dependencies = BIT(HYPERV_FEAT_VPINDEX)
 },
+[HYPERV_FEAT_STIMER_DIRECT] = {
+.desc = "direct mode synthetic timers (hv-stimer-direct)",
+.flags = {
+{.fw = FEAT_HYPERV_EDX,
+ .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
+},
+.dependencies = BIT(HYPERV_FEAT_STIMER)
+},
 };
 
 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max)
@@ -1124,6 +1132,7 @@ static int hyperv_handle_properties(CPUState *cs,
 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_TLBFLUSH);
 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_EVMCS);
 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_IPI);
+r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_STIMER_DIRECT);
 
 /* Additional dependencies not covered by kvm_hyperv_properties[] */
 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
-- 
1.8.3.1





[Qemu-devel] [PULL 13/25] target/i386: kvm: Delete VMX migration blocker on vCPU init failure

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Commit d98f26073beb ("target/i386: kvm: add VMX migration blocker")
added migration blocker for vCPU exposed with Intel VMX because QEMU
doesn't yet contain code to support migration of nested virtualization
workloads.

However, that commit missed deleting the migration blocker in case vCPU
init failed, similar to what is done for invtsc_mig_blocker. This commit
fixes that issue.

Fixes: d98f26073beb ("target/i386: kvm: add VMX migration blocker")
Signed-off-by: Liran Alon 
Reviewed-by: Maran Wilson 
Message-Id: <20190619162140.133674-2-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 279f99a..c5cbead 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1333,7 +1333,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
 r = kvm_arch_set_tsc_khz(cs);
 if (r < 0) {
-goto fail;
+return r;
 }
 
 /* vcpu's TSC frequency is either specified by user, or following
@@ -1622,7 +1622,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 if (local_err) {
 error_report_err(local_err);
 error_free(invtsc_mig_blocker);
-return r;
+goto fail2;
 }
 }
 }
@@ -1673,6 +1673,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
  fail:
 migrate_del_blocker(invtsc_mig_blocker);
+ fail2:
+migrate_del_blocker(vmx_mig_blocker);
+
 return r;
 }
 
-- 
1.8.3.1





[Qemu-devel] [PULL 15/25] target/i386: kvm: Use symbolic constant for #DB/#BP exception constants

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Reviewed-by: Nikita Leshenko 
Reviewed-by: Krish Sadhukhan 
Signed-off-by: Liran Alon 
Message-Id: <20190619162140.133674-4-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 26d8c61..c763643 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -3345,9 +3345,9 @@ static int kvm_guest_debug_workarounds(X86CPU *cpu)
 unsigned long reinject_trap = 0;
 
 if (!kvm_has_vcpu_events()) {
-if (env->exception_injected == 1) {
+if (env->exception_injected == EXCP01_DB) {
 reinject_trap = KVM_GUESTDBG_INJECT_DB;
-} else if (env->exception_injected == 3) {
+} else if (env->exception_injected == EXCP03_INT3) {
 reinject_trap = KVM_GUESTDBG_INJECT_BP;
 }
 env->exception_injected = -1;
@@ -3859,8 +3859,8 @@ static int kvm_handle_debug(X86CPU *cpu,
 int ret = 0;
 int n;
 
-if (arch_info->exception == 1) {
-if (arch_info->dr6 & (1 << 14)) {
+if (arch_info->exception == EXCP01_DB) {
+if (arch_info->dr6 & DR6_BS) {
 if (cs->singlestep_enabled) {
 ret = EXCP_DEBUG;
 }
-- 
1.8.3.1





[Qemu-devel] [PULL 23/25] sd: Fix out-of-bounds assertions

2019-06-20 Thread Paolo Bonzini
From: Lidong Chen 

Due to an off-by-one error, the assert statements allow an
out-of-bounds array access.  This doesn't happen in practice,
but the static analyzer notices.

Signed-off-by: Lidong Chen 
Reviewed-by: Liam Merwick 
Reviewed-by: Marc-André Lureau 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Li Qiang 
Reviewed-by: Darren Kenny 
Message-Id: 
<6b19cb7359a10a6bedc3ea0fce22fed3ef93c102.1560806687.git.lidong.c...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 hw/sd/sd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 60500ec..917195a6 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -145,7 +145,7 @@ static const char *sd_state_name(enum SDCardStates state)
 if (state == sd_inactive_state) {
 return "inactive";
 }
-assert(state <= ARRAY_SIZE(state_name));
+assert(state < ARRAY_SIZE(state_name));
 return state_name[state];
 }
 
@@ -166,7 +166,7 @@ static const char *sd_response_name(sd_rsp_type_t rsp)
 if (rsp == sd_r1b) {
 rsp = sd_r1;
 }
-assert(rsp <= ARRAY_SIZE(response_name));
+assert(rsp < ARRAY_SIZE(response_name));
 return response_name[rsp];
 }
 
-- 
1.8.3.1





[Qemu-devel] [PULL 22/25] target/i386: kvm: Add nested migration blocker only when kernel lacks required capabilities

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Previous commits have added support for migration of nested virtualization
workloads. This was done by utilising two new KVM capabilities:
KVM_CAP_NESTED_STATE and KVM_CAP_EXCEPTION_PAYLOAD, both of which are
required in order to correctly migrate such workloads.

Therefore, change code to add a migration blocker for vCPUs exposed with
Intel VMX or AMD SVM in case one of these kernel capabilities is
missing.

Signed-off-by: Liran Alon 
Reviewed-by: Maran Wilson 
Message-Id: <20190619162140.133674-11-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 9 +++--
 target/i386/machine.c | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index ab812b5..f08eab4 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1640,9 +1640,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
   !!(c->ecx & CPUID_EXT_SMX);
 }
 
-if (cpu_has_nested_virt(env) && !nested_virt_mig_blocker) {
+if (cpu_has_vmx(env) && !nested_virt_mig_blocker &&
+((kvm_max_nested_state_length() <= 0) || !has_exception_payload)) {
 error_setg(&nested_virt_mig_blocker,
-   "Nested virtualization does not support live migration 
yet");
+   "Kernel do not provide required capabilities for "
+   "nested virtualization migration. "
+   "(CAP_NESTED_STATE=%d, CAP_EXCEPTION_PAYLOAD=%d)",
+   kvm_max_nested_state_length() > 0,
+   has_exception_payload);
 r = migrate_add_blocker(nested_virt_mig_blocker, &local_err);
 if (local_err) {
 error_report_err(local_err);
diff --git a/target/i386/machine.c b/target/i386/machine.c
index f66f342..e077989 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -233,7 +233,7 @@ static int cpu_pre_save(void *opaque)
 
 #ifdef CONFIG_KVM
 /* Verify we have nested virtualization state from kernel if required */
-if (cpu_has_nested_virt(env) && !env->nested_state) {
+if (kvm_enabled() && cpu_has_vmx(env) && !env->nested_state) {
 error_report("Guest enabled nested virtualization but kernel "
 "does not support saving of nested state");
 return -EINVAL;
-- 
1.8.3.1





[Qemu-devel] [PULL 07/25] i386/kvm: implement 'hv-passthrough' mode

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

In many cases we just want to give Windows guests all currently supported
Hyper-V enlightenments, and that's where this new mode comes in handy. We
pass through what was returned by KVM_GET_SUPPORTED_HV_CPUID.

hv_cpuid_check_and_set() is modified to also set cpu->hyperv_* flags as
we may want to check them later (and we actually do for hv_runtime,
hv_synic,...).

'hv-passthrough' is a development-only feature; a migration blocker is
added to prevent issues while migrating between hosts with different
feature sets.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-6-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 docs/hyperv.txt   | 10 +++
 target/i386/cpu.c |  1 +
 target/i386/cpu.h |  1 +
 target/i386/kvm.c | 89 ++-
 4 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/docs/hyperv.txt b/docs/hyperv.txt
index c423e0f..beadb2d 100644
--- a/docs/hyperv.txt
+++ b/docs/hyperv.txt
@@ -175,6 +175,16 @@ without the feature to find out if enabling it is beneficial.
 Requires: hv-vapic
 
 
+4. Development features
+
+In some cases (e.g. during development) it may make sense to use QEMU in
+'pass-through' mode and give Windows guests all enlightenments currently
+supported by KVM. This pass-through mode is enabled by "hv-passthrough" CPU
+flag.
+Note: enabling this flag effectively prevents migration as supported features
+may differ between target and destination.
+
+
 4. Useful links
 
 Hyper-V Top Level Functional specification and other information:
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index e90c1ac..e07996c 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5883,6 +5883,7 @@ static Property x86_cpu_properties[] = {
   HYPERV_FEAT_EVMCS, 0),
 DEFINE_PROP_BIT64("hv-ipi", X86CPU, hyperv_features,
   HYPERV_FEAT_IPI, 0),
+DEFINE_PROP_BOOL("hv-passthrough", X86CPU, hyperv_passthrough, false),
 
 DEFINE_PROP_BOOL("check", X86CPU, check_cpuid, true),
 DEFINE_PROP_BOOL("enforce", X86CPU, enforce_cpuid, false),
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 30cd1a0..86edbf5 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1390,6 +1390,7 @@ struct X86CPU {
 char *hyperv_vendor_id;
 bool hyperv_synic_kvm_only;
 uint64_t hyperv_features;
+bool hyperv_passthrough;
 
 bool check_cpuid;
 bool enforce_cpuid;
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index a041b4d..93ac6ba 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -639,7 +639,7 @@ static bool hyperv_enabled(X86CPU *cpu)
 CPUState *cs = CPU(cpu);
 return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 &&
 ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_RETRY) ||
- cpu->hyperv_features);
+ cpu->hyperv_features || cpu->hyperv_passthrough);
 }
 
 static int kvm_arch_set_tsc_khz(CPUState *cs)
@@ -985,10 +985,10 @@ static int hv_cpuid_check_and_set(CPUState *cs, struct kvm_cpuid2 *cpuid,
 {
 X86CPU *cpu = X86_CPU(cs);
 CPUX86State *env = &cpu->env;
-uint32_t r, fw, bits;;
+uint32_t r, fw, bits;
 int i;
 
-if (!hyperv_feat_enabled(cpu, feature)) {
+if (!hyperv_feat_enabled(cpu, feature) && !cpu->hyperv_passthrough) {
 return 0;
 }
 
@@ -1001,15 +1001,23 @@ static int hv_cpuid_check_and_set(CPUState *cs, struct kvm_cpuid2 *cpuid,
 }
 
 if (hv_cpuid_get_fw(cpuid, fw, &r) || (r & bits) != bits) {
-fprintf(stderr,
-"Hyper-V %s is not supported by kernel\n",
-kvm_hyperv_properties[feature].desc);
-return 1;
+if (hyperv_feat_enabled(cpu, feature)) {
+fprintf(stderr,
+"Hyper-V %s is not supported by kernel\n",
+kvm_hyperv_properties[feature].desc);
+return 1;
+} else {
+return 0;
+}
 }
 
 env->features[fw] |= bits;
 }
 
+if (cpu->hyperv_passthrough) {
+cpu->hyperv_features |= BIT(feature);
+}
+
 return 0;
 }
 
@@ -1027,22 +1035,29 @@ static int hyperv_handle_properties(CPUState *cs,
 struct kvm_cpuid_entry2 *c;
 uint32_t signature[3];
 uint32_t cpuid_i = 0;
-int r = 0;
+int r;
 
 if (!hyperv_enabled(cpu))
 return 0;
 
-if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
+if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ||
+cpu->hyperv_passthrough) {
 uint16_t evmcs_version;
 
-if (kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
-(uintptr_t)&evmcs_version)) {
+r = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
+(uintptr_t)&evmcs_version);
+
+if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) && r) {
 

[Qemu-devel] [PULL 14/25] KVM: Introduce kvm_arch_destroy_vcpu()

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Similar to how kvm_init_vcpu() calls kvm_arch_init_vcpu() to perform
arch-dependent initialisation, introduce kvm_arch_destroy_vcpu()
to be called from kvm_destroy_vcpu() to perform arch-dependent
destruction.

This was added because some architectures (such as i386)
currently do not free memory that they have allocated in
kvm_arch_init_vcpu().

Suggested-by: Maran Wilson 
Reviewed-by: Maran Wilson 
Signed-off-by: Liran Alon 
Message-Id: <20190619162140.133674-3-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 accel/kvm/kvm-all.c  |  5 +
 include/sysemu/kvm.h |  1 +
 target/arm/kvm32.c   |  5 +
 target/arm/kvm64.c   |  5 +
 target/i386/kvm.c| 12 
 target/mips/kvm.c|  5 +
 target/ppc/kvm.c |  5 +
 target/s390x/kvm.c   | 10 ++
 8 files changed, 48 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index d2f481a..f0f5ab8 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -291,6 +291,11 @@ int kvm_destroy_vcpu(CPUState *cpu)
 
 DPRINTF("kvm_destroy_vcpu\n");
 
+ret = kvm_arch_destroy_vcpu(cpu);
+if (ret < 0) {
+goto err;
+}
+
 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 if (mmap_size < 0) {
 ret = mmap_size;
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index a6d1cd1..64f55e5 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -371,6 +371,7 @@ int kvm_arch_put_registers(CPUState *cpu, int level);
 int kvm_arch_init(MachineState *ms, KVMState *s);
 
 int kvm_arch_init_vcpu(CPUState *cpu);
+int kvm_arch_destroy_vcpu(CPUState *cpu);
 
 bool kvm_vcpu_id_is_valid(int vcpu_id);
 
diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c
index 4e54e37..51f78f7 100644
--- a/target/arm/kvm32.c
+++ b/target/arm/kvm32.c
@@ -240,6 +240,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return kvm_arm_init_cpreg_list(cpu);
 }
 
+int kvm_arch_destroy_vcpu(CPUState *cs)
+{
+   return 0;
+}
+
 typedef struct Reg {
 uint64_t id;
 int offset;
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 998d21f..22d19c9 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -654,6 +654,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return kvm_arm_init_cpreg_list(cpu);
 }
 
+int kvm_arch_destroy_vcpu(CPUState *cs)
+{
+return 0;
+}
+
 bool kvm_arm_reg_syncs_via_cpreg_list(uint64_t regidx)
 {
 /* Return true if the regidx is a register we should synchronize
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index c5cbead..26d8c61 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1679,6 +1679,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return r;
 }
 
+int kvm_arch_destroy_vcpu(CPUState *cs)
+{
+X86CPU *cpu = X86_CPU(cs);
+
+if (cpu->kvm_msr_buf) {
+g_free(cpu->kvm_msr_buf);
+cpu->kvm_msr_buf = NULL;
+}
+
+return 0;
+}
+
 void kvm_arch_reset_vcpu(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
diff --git a/target/mips/kvm.c b/target/mips/kvm.c
index 8e72850..938f8f1 100644
--- a/target/mips/kvm.c
+++ b/target/mips/kvm.c
@@ -91,6 +91,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
+int kvm_arch_destroy_vcpu(CPUState *cs)
+{
+return 0;
+}
+
 void kvm_mips_reset_vcpu(MIPSCPU *cpu)
 {
 CPUMIPSState *env = &cpu->env;
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index d4107dd..4b4989c 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -521,6 +521,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
+int kvm_arch_destroy_vcpu(CPUState *cs)
+{
+return 0;
+}
+
 static void kvm_sw_tlb_put(PowerPCCPU *cpu)
 {
 CPUPPCState *env = &cpu->env;
diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
index bcec979..0267c6c 100644
--- a/target/s390x/kvm.c
+++ b/target/s390x/kvm.c
@@ -368,6 +368,16 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return 0;
 }
 
+int kvm_arch_destroy_vcpu(CPUState *cs)
+{
+S390CPU *cpu = S390_CPU(cs);
+
+g_free(cpu->irqstate);
+cpu->irqstate = NULL;
+
+return 0;
+}
+
 void kvm_s390_reset_vcpu(S390CPU *cpu)
 {
 CPUState *cs = CPU(cpu);
-- 
1.8.3.1





[Qemu-devel] [PULL 16/25] target/i386: kvm: Re-inject #DB to guest with updated DR6

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

If userspace (QEMU) debugs the guest, then when a #DB is raised in the guest
and intercepted by KVM, KVM forwards information on the #DB to userspace
instead of injecting the #DB into the guest.
While doing so, KVM doesn't update the vCPU DR6 but instead reports the #DB
DR6 value to userspace for further handling.
See KVM's handle_exception() DB_VECTOR handler.

QEMU's handler for this case is kvm_handle_debug(). This handler basically
checks whether the #DB is related to one of the user-set hardware breakpoints
and, if not, re-injects the #DB into the guest.
The re-injection is done by setting env->exception_injected to #DB, which
will later be passed as events.exception.nr to the KVM_SET_VCPU_EVENTS ioctl
by kvm_put_vcpu_events().

However, in case userspace re-injects the #DB, KVM expects userspace to set
the vCPU DR6 as it was reported to userspace when the #DB was intercepted!
Otherwise, the KVM_REQ_EVENT handler will inject the #DB with a wrong DR6
into the guest.

Fix this issue by updating the vCPU DR6 appropriately when re-injecting the
#DB into the guest.

Reviewed-by: Nikita Leshenko 
Reviewed-by: Krish Sadhukhan 
Signed-off-by: Liran Alon 
Message-Id: <20190619162140.133674-5-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index c763643..9864aa0 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -3897,6 +3897,9 @@ static int kvm_handle_debug(X86CPU *cpu,
 /* pass to guest */
 env->exception_injected = arch_info->exception;
 env->has_error_code = 0;
+if (arch_info->exception == EXCP01_DB) {
+env->dr[6] = arch_info->dr6;
+}
 }
 
 return ret;
-- 
1.8.3.1





[Qemu-devel] [PULL 18/25] linux-headers: sync with latest KVM headers from Linux 5.2

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Improve the KVM_{GET,SET}_NESTED_STATE structs by detailing the format
of VMX nested state data in a struct.

In order to avoid changing the ioctl values of
KVM_{GET,SET}_NESTED_STATE, there is a need to preserve
sizeof(struct kvm_nested_state). This is done by defining the data
struct as "data.vmx[0]". It was the most elegant way I found to
preserve struct size while still keeping struct readable and easy to
maintain. It does have a misfortunate side-effect that now it has to be
accessed as "data.vmx[0]" rather than just "data.vmx".

Because we are already modifying these structs, I also modified the
following:
* Define the "format" field values as macros.
* Rename vmcs_pa to vmcs12_pa for better readability.

Signed-off-by: Liran Alon 
Reviewed-by: Maran Wilson 
Message-Id: <20190619162140.133674-7-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 linux-headers/asm-x86/kvm.h | 33 ++---
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
index 7a0e64c..6e7dd79 100644
--- a/linux-headers/asm-x86/kvm.h
+++ b/linux-headers/asm-x86/kvm.h
@@ -383,16 +383,26 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE  (1 << 2)
 #define KVM_X86_QUIRK_OUT_7E_INC_RIP   (1 << 3)
 
+#define KVM_STATE_NESTED_FORMAT_VMX0
+#define KVM_STATE_NESTED_FORMAT_SVM1
+
 #define KVM_STATE_NESTED_GUEST_MODE0x0001
 #define KVM_STATE_NESTED_RUN_PENDING   0x0002
 #define KVM_STATE_NESTED_EVMCS 0x0004
 
+#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000
+
 #define KVM_STATE_NESTED_SMM_GUEST_MODE0x0001
 #define KVM_STATE_NESTED_SMM_VMXON 0x0002
 
-struct kvm_vmx_nested_state {
+struct kvm_vmx_nested_state_data {
+   __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+   __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+};
+
+struct kvm_vmx_nested_state_hdr {
__u64 vmxon_pa;
-   __u64 vmcs_pa;
+   __u64 vmcs12_pa;
 
struct {
__u16 flags;
@@ -401,24 +411,25 @@ struct kvm_vmx_nested_state {
 
 /* for KVM_CAP_NESTED_STATE */
 struct kvm_nested_state {
-   /* KVM_STATE_* flags */
__u16 flags;
-
-   /* 0 for VMX, 1 for SVM.  */
__u16 format;
-
-   /* 128 for SVM, 128 + VMCS size for VMX.  */
__u32 size;
 
union {
-   /* VMXON, VMCS */
-   struct kvm_vmx_nested_state vmx;
+   struct kvm_vmx_nested_state_hdr vmx;
 
/* Pad the header to 128 bytes.  */
__u8 pad[120];
-   };
+   } hdr;
 
-   __u8 data[0];
+   /*
+* Define data region as 0 bytes to preserve backwards-compatability
+* to old definition of kvm_nested_state in order to avoid changing
+* KVM_{GET,PUT}_NESTED_STATE ioctl values.
+*/
+   union {
+   struct kvm_vmx_nested_state_data vmx[0];
+   } data;
 };
 
 #endif /* _ASM_X86_KVM_H */
-- 
1.8.3.1
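
For reference, this is roughly how a userspace consumer of the updated header ends up addressing the new layout -- note the data.vmx[0] accessor mentioned above. A sketch only: it assumes the header as synced by this patch is installed, the KVM_GET_NESTED_STATE ioctl call itself is elided, and error handling is minimal.

#include <linux/kvm.h>   /* needs the header as synced by this patch */

/* Sketch: walk a kvm_nested_state buffer that KVM_GET_NESTED_STATE has
 * already filled in. */
static void inspect_vmx_nested_state(struct kvm_nested_state *state)
{
    if (state->format != KVM_STATE_NESTED_FORMAT_VMX) {
        return;
    }

    /* Header fields now live under the named union 'hdr'. */
    __u64 vmxon_pa  = state->hdr.vmx.vmxon_pa;
    __u64 vmcs12_pa = state->hdr.vmx.vmcs12_pa;
    (void)vmxon_pa;
    (void)vmcs12_pa;

    /* 'size' counts the 128-byte header plus the VMX payload, and the
     * zero-length array forces the data.vmx[0] form of access. */
    if (state->size >= sizeof(*state) + KVM_STATE_NESTED_VMX_VMCS_SIZE) {
        __u8 *vmcs12 = state->data.vmx[0].vmcs12;
        (void)vmcs12;
    }
}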





[Qemu-devel] [PULL 19/25] vmstate: Add support for kernel integer types

2019-06-20 Thread Paolo Bonzini
From: Liran Alon 

Reviewed-by: Nikita Leshenko 
Reviewed-by: Maran Wilson 
Signed-off-by: Liran Alon 
Reviewed-by: Dr. David Alan Gilbert 
Message-Id: <20190619162140.133674-8-liran.a...@oracle.com>
Signed-off-by: Paolo Bonzini 
---
 include/migration/vmstate.h | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 9224370..ca68584 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -797,6 +797,19 @@ extern const VMStateInfo vmstate_info_qtailq;
 #define VMSTATE_UINT64_V(_f, _s, _v)  \
 VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint64, uint64_t)
 
+#ifdef CONFIG_LINUX
+
+#define VMSTATE_U8_V(_f, _s, _v)   \
+VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint8, __u8)
+#define VMSTATE_U16_V(_f, _s, _v)  \
+VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint16, __u16)
+#define VMSTATE_U32_V(_f, _s, _v)  \
+VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint32, __u32)
+#define VMSTATE_U64_V(_f, _s, _v)  \
+VMSTATE_SINGLE(_f, _s, _v, vmstate_info_uint64, __u64)
+
+#endif
+
 #define VMSTATE_BOOL(_f, _s)  \
 VMSTATE_BOOL_V(_f, _s, 0)
 
@@ -818,6 +831,19 @@ extern const VMStateInfo vmstate_info_qtailq;
 #define VMSTATE_UINT64(_f, _s)\
 VMSTATE_UINT64_V(_f, _s, 0)
 
+#ifdef CONFIG_LINUX
+
+#define VMSTATE_U8(_f, _s) \
+VMSTATE_U8_V(_f, _s, 0)
+#define VMSTATE_U16(_f, _s)\
+VMSTATE_U16_V(_f, _s, 0)
+#define VMSTATE_U32(_f, _s)\
+VMSTATE_U32_V(_f, _s, 0)
+#define VMSTATE_U64(_f, _s)\
+VMSTATE_U64_V(_f, _s, 0)
+
+#endif
+
 #define VMSTATE_UINT8_EQUAL(_f, _s, _err_hint)\
 VMSTATE_SINGLE_FULL(_f, _s, 0, 0, \
 vmstate_info_uint8_equal, uint8_t, _err_hint)
-- 
1.8.3.1
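
A hedged usage example of the new macros: the ExampleKernState struct and its field names below are made up; only VMSTATE_U32/VMSTATE_U64 (from this patch) and the usual VMStateDescription boilerplate are existing QEMU interfaces. The point is that the described field really has a kernel __u32/__u64 type rather than uint32_t/uint64_t.

#include "qemu/osdep.h"
#include "migration/vmstate.h"

#ifdef CONFIG_LINUX
#include <linux/types.h>

/* Hypothetical state that mirrors a kernel ABI struct, hence __u32/__u64. */
typedef struct ExampleKernState {
    __u32 flags;
    __u64 data_pa;
} ExampleKernState;

static const VMStateDescription vmstate_example_kern_state = {
    .name = "example-kern-state",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_U32(flags, ExampleKernState),    /* new macro */
        VMSTATE_U64(data_pa, ExampleKernState),  /* new macro */
        VMSTATE_END_OF_LIST()
    }
};
#endif /* CONFIG_LINUX */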





[Qemu-devel] [PULL 04/25] i386/kvm: add support for KVM_GET_SUPPORTED_HV_CPUID

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

KVM now supports reporting supported Hyper-V features through CPUID
(KVM_GET_SUPPORTED_HV_CPUID ioctl). Going forward, this is going to be
the only way to announce new functionality and this has already happened
with Direct Mode stimers.

While we could just support KVM_GET_SUPPORTED_HV_CPUID for new features,
it seems to be beneficial to use it for all Hyper-V enlightenments when
possible. This way we can implement 'hv-all' pass-through mode giving the
guest all supported Hyper-V features even when QEMU knows nothing about
them.

Implementation-wise we create a new kvm_hyperv_properties structure
defining Hyper-V features, get_supported_hv_cpuid()/
get_supported_hv_cpuid_legacy() returning the supported CPUID set, and
a bit over-engineered hv_cpuid_check_and_set() which will also be used
to set cpu->hyperv_* properties for 'hv-all' mode.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-3-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 474 --
 1 file changed, 356 insertions(+), 118 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index b34eb81..cd492d4 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -683,156 +683,394 @@ static bool tsc_is_stable_and_known(CPUX86State *env)
 || env->user_tsc_khz;
 }
 
-static int hyperv_handle_properties(CPUState *cs)
+static struct {
+const char *desc;
+struct {
+uint32_t fw;
+uint32_t bits;
+} flags[2];
+} kvm_hyperv_properties[] = {
+[HYPERV_FEAT_RELAXED] = {
+.desc = "relaxed timing (hv-relaxed)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_HYPERCALL_AVAILABLE},
+{.fw = FEAT_HV_RECOMM_EAX,
+ .bits = HV_RELAXED_TIMING_RECOMMENDED}
+}
+},
+[HYPERV_FEAT_VAPIC] = {
+.desc = "virtual APIC (hv-vapic)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_HYPERCALL_AVAILABLE | HV_APIC_ACCESS_AVAILABLE},
+{.fw = FEAT_HV_RECOMM_EAX,
+ .bits = HV_APIC_ACCESS_RECOMMENDED}
+}
+},
+[HYPERV_FEAT_TIME] = {
+.desc = "clocksources (hv-time)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_HYPERCALL_AVAILABLE | HV_TIME_REF_COUNT_AVAILABLE |
+ HV_REFERENCE_TSC_AVAILABLE}
+}
+},
+[HYPERV_FEAT_CRASH] = {
+.desc = "crash MSRs (hv-crash)",
+.flags = {
+{.fw = FEAT_HYPERV_EDX,
+ .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
+}
+},
+[HYPERV_FEAT_RESET] = {
+.desc = "reset MSR (hv-reset)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_RESET_AVAILABLE}
+}
+},
+[HYPERV_FEAT_VPINDEX] = {
+.desc = "VP_INDEX MSR (hv-vpindex)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_VP_INDEX_AVAILABLE}
+}
+},
+[HYPERV_FEAT_RUNTIME] = {
+.desc = "VP_RUNTIME MSR (hv-runtime)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_VP_RUNTIME_AVAILABLE}
+}
+},
+[HYPERV_FEAT_SYNIC] = {
+.desc = "synthetic interrupt controller (hv-synic)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_SYNIC_AVAILABLE}
+}
+},
+[HYPERV_FEAT_STIMER] = {
+.desc = "synthetic timers (hv-stimer)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_SYNTIMERS_AVAILABLE}
+}
+},
+[HYPERV_FEAT_FREQUENCIES] = {
+.desc = "frequency MSRs (hv-frequencies)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_ACCESS_FREQUENCY_MSRS},
+{.fw = FEAT_HYPERV_EDX,
+ .bits = HV_FREQUENCY_MSRS_AVAILABLE}
+}
+},
+[HYPERV_FEAT_REENLIGHTENMENT] = {
+.desc = "reenlightenment MSRs (hv-reenlightenment)",
+.flags = {
+{.fw = FEAT_HYPERV_EAX,
+ .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
+}
+},
+[HYPERV_FEAT_TLBFLUSH] = {
+.desc = "paravirtualized TLB flush (hv-tlbflush)",
+.flags = {
+{.fw = FEAT_HV_RECOMM_EAX,
+ .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
+ HV_EX_PROCESSOR_MASKS_RECOMMENDED}
+}
+},
+[HYPERV_FEAT_EVMCS] = {
+.desc = "enlightened VMCS (hv-evmcs)",
+.flags = {
+{.fw = FEAT_HV_RECOMM_EAX,
+ .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
+}
+},
+[HYPERV_FEAT_IPI] = {
+.desc = "paravirtualized IPI (hv-ipi)",
+.flags = {
+{.fw = FEAT_HV_RECOMM_EAX,
+ .bits = HV_CLUSTER_IPI_RECOMMENDED |
+ HV_EX_PROCESSOR_MASKS_RECOMMENDED}
+}
+},
+};
+
+static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int 

[Qemu-devel] [PULL 08/25] i386/kvm: hv-stimer requires hv-time and hv-synic

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

Synthetic timers operate in hv-time time and Windows won't use these
without SynIC.

Add .dependencies field to kvm_hyperv_properties[] and a generic mechanism
to check dependencies between features.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-7-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 93ac6ba..58afa31 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -689,6 +689,7 @@ static struct {
 uint32_t fw;
 uint32_t bits;
 } flags[2];
+uint64_t dependencies;
 } kvm_hyperv_properties[] = {
 [HYPERV_FEAT_RELAXED] = {
 .desc = "relaxed timing (hv-relaxed)",
@@ -756,7 +757,8 @@ static struct {
 .flags = {
 {.fw = FEAT_HYPERV_EAX,
  .bits = HV_SYNTIMERS_AVAILABLE}
-}
+},
+.dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
 },
 [HYPERV_FEAT_FREQUENCIES] = {
 .desc = "frequency MSRs (hv-frequencies)",
@@ -986,12 +988,25 @@ static int hv_cpuid_check_and_set(CPUState *cs, struct kvm_cpuid2 *cpuid,
 X86CPU *cpu = X86_CPU(cs);
 CPUX86State *env = &cpu->env;
 uint32_t r, fw, bits;
-int i;
+uint64_t deps;
+int i, dep_feat = 0;
 
 if (!hyperv_feat_enabled(cpu, feature) && !cpu->hyperv_passthrough) {
 return 0;
 }
 
+deps = kvm_hyperv_properties[feature].dependencies;
+while ((dep_feat = find_next_bit(&deps, 64, dep_feat)) < 64) {
+if (!(hyperv_feat_enabled(cpu, dep_feat))) {
+fprintf(stderr,
+"Hyper-V %s requires Hyper-V %s\n",
+kvm_hyperv_properties[feature].desc,
+kvm_hyperv_properties[dep_feat].desc);
+return 1;
+}
+dep_feat++;
+}
+
 for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
 fw = kvm_hyperv_properties[feature].flags[i].fw;
 bits = kvm_hyperv_properties[feature].flags[i].bits;
@@ -1107,11 +1122,11 @@ static int hyperv_handle_properties(CPUState *cs,
 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_EVMCS);
 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_IPI);
 
-/* Dependencies */
+/* Additional dependencies not covered by kvm_hyperv_properties[] */
 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
 !cpu->hyperv_synic_kvm_only &&
 !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
-fprintf(stderr, "Hyper-V %s requires %s\n",
+fprintf(stderr, "Hyper-V %s requires Hyper-V %s\n",
 kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
 kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
 r |= 1;
-- 
1.8.3.1





[Qemu-devel] [PULL 10/25] i386/kvm: hv-evmcs requires hv-vapic

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

Enlightened VMCS is enabled by writing to a field in VP assist page and
these require virtual APIC.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-9-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 1dfa282..86de510 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -790,7 +790,8 @@ static struct {
 .flags = {
 {.fw = FEAT_HV_RECOMM_EAX,
  .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
-}
+},
+.dependencies = BIT(HYPERV_FEAT_VAPIC)
 },
 [HYPERV_FEAT_IPI] = {
 .desc = "paravirtualized IPI (hv-ipi)",
-- 
1.8.3.1





[Qemu-devel] [PULL 24/25] util/main-loop: Fix incorrect assertion

2019-06-20 Thread Paolo Bonzini
From: Lidong Chen 

The check for poll_fds in g_assert() was incorrect. The correct assertion
should check "n_poll_fds + w->num <= ARRAY_SIZE(poll_fds)" because the
subsequent for-loop accesses poll_fds[n_poll_fds + i] where i is in
[0, w->num). Such an overflow could happen with a very high number of file
descriptors and/or wait objects.

Signed-off-by: Lidong Chen 
Suggested-by: Peter Maydell 
Suggested-by: Liam Merwick 
Reviewed-by: Liran Alon 
Reviewed-by: Darren Kenny 
Reviewed-by: Li Qiang 
Reviewed-by: Philippe Mathieu-Daudé 
Message-Id: 

Signed-off-by: Paolo Bonzini 
---
 util/main-loop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/util/main-loop.c b/util/main-loop.c
index e1e349c..a9f4e8d 100644
--- a/util/main-loop.c
+++ b/util/main-loop.c
@@ -422,7 +422,7 @@ static int os_host_main_loop_wait(int64_t timeout)
 g_main_context_prepare(context, &max_priority);
 n_poll_fds = g_main_context_query(context, max_priority, &poll_timeout,
   poll_fds, ARRAY_SIZE(poll_fds));
-g_assert(n_poll_fds <= ARRAY_SIZE(poll_fds));
+g_assert(n_poll_fds + w->num <= ARRAY_SIZE(poll_fds));
 
 for (i = 0; i < w->num; i++) {
 poll_fds[n_poll_fds + i].fd = (DWORD_PTR)w->events[i];
-- 
1.8.3.1





[Qemu-devel] [PULL 09/25] i386/kvm: hv-tlbflush/ipi require hv-vpindex

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

The corresponding hypercalls require using VP indexes.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-8-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 58afa31..1dfa282 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -782,7 +782,8 @@ static struct {
 {.fw = FEAT_HV_RECOMM_EAX,
  .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
  HV_EX_PROCESSOR_MASKS_RECOMMENDED}
-}
+},
+.dependencies = BIT(HYPERV_FEAT_VPINDEX)
 },
 [HYPERV_FEAT_EVMCS] = {
 .desc = "enlightened VMCS (hv-evmcs)",
@@ -797,7 +798,8 @@ static struct {
 {.fw = FEAT_HV_RECOMM_EAX,
  .bits = HV_CLUSTER_IPI_RECOMMENDED |
  HV_EX_PROCESSOR_MASKS_RECOMMENDED}
-}
+},
+.dependencies = BIT(HYPERV_FEAT_VPINDEX)
 },
 };
 
-- 
1.8.3.1





[Qemu-devel] [PULL 06/25] i386/kvm: document existing Hyper-V enlightenments

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

Currently, there is no doc describing hv-* CPU flags; people are
encouraged to get the information from the Microsoft Hyper-V Top Level
Functional Specification (TLFS). There are, however, a few QEMU
specifics.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-5-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 docs/hyperv.txt | 181 
 1 file changed, 181 insertions(+)
 create mode 100644 docs/hyperv.txt

diff --git a/docs/hyperv.txt b/docs/hyperv.txt
new file mode 100644
index 000..c423e0f
--- /dev/null
+++ b/docs/hyperv.txt
@@ -0,0 +1,181 @@
+Hyper-V Enlightenments
+==
+
+
+1. Description
+===
+In some cases when implementing a hardware interface in software is slow, KVM
+implements its own paravirtualized interfaces. This works well for Linux as
+guest support for such features is added simultaneously with the feature itself.
+It may, however, be hard-to-impossible to add support for these interfaces to
+proprietary OSes, namely, Microsoft Windows.
+
+KVM on x86 implements Hyper-V Enlightenments for Windows guests. These features
+make Windows and Hyper-V guests think they're running on top of a Hyper-V
+compatible hypervisor and use Hyper-V specific features.
+
+
+2. Setup
+=
+No Hyper-V enlightenments are enabled by default by either KVM or QEMU. In
+QEMU, individual enlightenments can be enabled through CPU flags, e.g:
+
+  qemu-system-x86_64 --enable-kvm --cpu host,hv_relaxed,hv_vpindex,hv_time, ...
+
+Sometimes there are dependencies between enlightenments, QEMU is supposed to
+check that the supplied configuration is sane.
+
+When any set of the Hyper-V enlightenments is enabled, QEMU changes hypervisor
+identification (CPUID 0x4000..0x400A) to Hyper-V. KVM identification
+and features are kept in leaves 0x4100..0x4101.
+
+
+3. Existing enlightenments
+===
+
+3.1. hv-relaxed
+
+This feature tells guest OS to disable watchdog timeouts as it is running on a
+hypervisor. It is known that some Windows versions will do this even when they
+see 'hypervisor' CPU flag.
+
+3.2. hv-vapic
+==
+Provides so-called VP Assist page MSR to guest allowing it to work with APIC
+more efficiently. In particular, this enlightenment allows paravirtualized
+(exit-less) EOI processing.
+
+3.3. hv-spinlocks=xxx
+==
+Enables paravirtualized spinlocks. The parameter indicates how many times
+spinlock acquisition should be attempted before indicating the situation to the
+hypervisor. A special value 0x indicates "never to retry".
+
+3.4. hv-vpindex
+
+Provides HV_X64_MSR_VP_INDEX (0x4002) MSR to the guest which has Virtual
+processor index information. This enlightenment makes sense in conjunction with
+hv-synic, hv-stimer and other enlightenments which require the guest to know its
+Virtual Processor indices (e.g. when VP index needs to be passed in a
+hypercall).
+
+3.5. hv-runtime
+
+Provides HV_X64_MSR_VP_RUNTIME (0x4010) MSR to the guest. The MSR keeps the
+virtual processor run time in 100ns units. This gives guest operating system an
+idea of how much time was 'stolen' from it (when the virtual CPU was preempted
+to perform some other work).
+
+3.6. hv-crash
+==
+Provides HV_X64_MSR_CRASH_P0..HV_X64_MSR_CRASH_P5 (0x4100..0x4105) and
+HV_X64_MSR_CRASH_CTL (0x4105) MSRs to the guest. These MSRs are written to
+by the guest when it crashes, HV_X64_MSR_CRASH_P0..HV_X64_MSR_CRASH_P5 MSRs
+contain additional crash information. This information is outputted in QEMU log
+and through QAPI.
+Note: unlike under genuine Hyper-V, write to HV_X64_MSR_CRASH_CTL causes guest
+to shutdown. This effectively blocks crash dump generation by Windows.
+
+3.7. hv-time
+=
+Enables two Hyper-V-specific clocksources available to the guest: MSR-based
+Hyper-V clocksource (HV_X64_MSR_TIME_REF_COUNT, 0x4020) and Reference TSC
+page (enabled via MSR HV_X64_MSR_REFERENCE_TSC, 0x4021). Both clocksources
+are per-guest, Reference TSC page clocksource allows for exit-less time stamp
+readings. Using this enlightenment leads to significant speedup of all timestamp
+related operations.
+
+3.8. hv-synic
+==
+Enables Hyper-V Synthetic interrupt controller - an extension of a local APIC.
+When enabled, this enlightenment provides additional communication facilities
+to the guest: SynIC messages and Events. This is a pre-requisite for
+implementing VMBus devices (not yet in QEMU). Additionally, this enlightenment
+is needed to enable Hyper-V synthetic timers. SynIC is controlled through MSRs
+HV_X64_MSR_SCONTROL..HV_X64_MSR_EOM (0x4080..0x4084) and
+HV_X64_MSR_SINT0..HV_X64_MSR_SINT15 (0x4090..0x409F)
+
+Requires: hv-vpindex
+
+3.9. hv-stimer
+===
+Enables Hyper-V synthetic timers. There are 

[Qemu-devel] [PULL 00/25] Misc (mostly x86) patches for 2019-06-21

2019-06-20 Thread Paolo Bonzini
The following changes since commit 33d609990621dea6c7d056c86f707b8811320ac1:

  Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging (2019-06-18 17:00:52 +0100)

are available in the git repository at:


  git://github.com/bonzini/qemu.git tags/for-upstream

for you to fetch changes up to 15e7ea4a8e0ed97361fefabb04c6a9aaebe321df:

  hw: Nuke hw_compat_4_0_1 and pc_compat_4_0_1 (2019-06-21 02:29:40 +0200)


* Nuke hw_compat_4_0_1 and pc_compat_4_0_1 (Greg)
* Static analysis fixes (Igor, Lidong)
* X86 Hyper-V CPUID improvements (Vitaly)
* X86 nested virt migration (Liran)
* New MSR-based features (Xiaoyao)


Colin Xu (1):
  hax: Honor CPUState::halted

Greg Kurz (1):
  hw: Nuke hw_compat_4_0_1 and pc_compat_4_0_1

Lidong Chen (2):
  sd: Fix out-of-bounds assertions
  util/main-loop: Fix incorrect assertion

Liran Alon (10):
  target/i386: kvm: Delete VMX migration blocker on vCPU init failure
  KVM: Introduce kvm_arch_destroy_vcpu()
  target/i386: kvm: Use symbolic constant for #DB/#BP exception constants
  target/i386: kvm: Re-inject #DB to guest with updated DR6
  target/i386: kvm: Block migration for vCPUs exposed with nested virtualization
  linux-headers: sync with latest KVM headers from Linux 5.2
  vmstate: Add support for kernel integer types
  target/i386: kvm: Add support for save and restore nested state
  target/i386: kvm: Add support for KVM_CAP_EXCEPTION_PAYLOAD
  target/i386: kvm: Add nested migration blocker only when kernel lacks required capabilities

Vitaly Kuznetsov (9):
  i386/kvm: convert hyperv enlightenments properties from bools to bits
  i386/kvm: add support for KVM_GET_SUPPORTED_HV_CPUID
  i386/kvm: move Hyper-V CPUID filling to hyperv_handle_properties()
  i386/kvm: document existing Hyper-V enlightenments
  i386/kvm: implement 'hv-passthrough' mode
  i386/kvm: hv-stimer requires hv-time and hv-synic
  i386/kvm: hv-tlbflush/ipi require hv-vpindex
  i386/kvm: hv-evmcs requires hv-vapic
  i386/kvm: add support for Direct Mode for Hyper-V synthetic timers

Xiaoyao Li (1):
  target/i386: define a new MSR based feature word - FEAT_CORE_CAPABILITY

Yury Kotov (1):
  kvm-all: Add/update fprintf's for kvm_*_ioeventfd_del

 accel/kvm/kvm-all.c |  25 +-
 cpus.c  |   1 -
 docs/hyperv.txt | 201 +
 hw/core/machine.c   |   5 +-
 hw/i386/pc.c|   6 +-
 hw/i386/pc_q35.c|  12 +-
 hw/sd/sd.c  |   4 +-
 include/hw/boards.h |   3 -
 include/hw/i386/pc.h|   3 -
 include/migration/vmstate.h |  26 ++
 include/sysemu/kvm.h|   2 +
 linux-headers/asm-x86/kvm.h |  33 +-
 target/arm/kvm32.c  |   5 +
 target/arm/kvm64.c  |   5 +
 target/i386/cpu.c   |  81 +++-
 target/i386/cpu.h   |  75 +++-
 target/i386/hax-all.c   |  36 +-
 target/i386/hvf/hvf.c   |  10 +-
 target/i386/hvf/x86hvf.c|   4 +-
 target/i386/hyperv-proto.h  |   1 +
 target/i386/hyperv.c|   2 +-
 target/i386/kvm.c   | 999 +---
 target/i386/machine.c   | 284 -
 target/mips/kvm.c   |   5 +
 target/ppc/kvm.c|   5 +
 target/s390x/kvm.c  |  10 +
 util/main-loop.c|   2 +-
 27 files changed, 1516 insertions(+), 329 deletions(-)
 create mode 100644 docs/hyperv.txt
-- 
1.8.3.1




[Qemu-devel] [PULL 05/25] i386/kvm: move Hyper-V CPUID filling to hyperv_handle_properties()

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

Let's consolidate Hyper-V features handling in hyperv_handle_properties().
The change is necessary to support 'hv-passthrough' mode as we'll be just
copying CPUIDs from KVM instead of filling them in.

Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-4-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 target/i386/kvm.c | 163 ++
 1 file changed, 90 insertions(+), 73 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index cd492d4..a041b4d 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1013,13 +1013,25 @@ static int hv_cpuid_check_and_set(CPUState *cs, struct kvm_cpuid2 *cpuid,
 return 0;
 }
 
-static int hyperv_handle_properties(CPUState *cs)
+/*
+ * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent in
+ * case of success, errno < 0 in case of failure and 0 when no Hyper-V
+ * extentions are enabled.
+ */
+static int hyperv_handle_properties(CPUState *cs,
+struct kvm_cpuid_entry2 *cpuid_ent)
 {
 X86CPU *cpu = X86_CPU(cs);
 CPUX86State *env = &cpu->env;
 struct kvm_cpuid2 *cpuid;
+struct kvm_cpuid_entry2 *c;
+uint32_t signature[3];
+uint32_t cpuid_i = 0;
 int r = 0;
 
+if (!hyperv_enabled(cpu))
+return 0;
+
 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
 uint16_t evmcs_version;
 
@@ -1068,9 +1080,80 @@ static int hyperv_handle_properties(CPUState *cs)
 /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
 env->features[FEAT_HYPERV_EDX] |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
 
+if (r) {
+r = -ENOSYS;
+goto free;
+}
+
+c = &cpuid_ent[cpuid_i++];
+c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+if (!cpu->hyperv_vendor_id) {
+memcpy(signature, "Microsoft Hv", 12);
+} else {
+size_t len = strlen(cpu->hyperv_vendor_id);
+
+if (len > 12) {
+error_report("hv-vendor-id truncated to 12 characters");
+len = 12;
+}
+memset(signature, 0, 12);
+memcpy(signature, cpu->hyperv_vendor_id, len);
+}
+c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
+HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
+c->ebx = signature[0];
+c->ecx = signature[1];
+c->edx = signature[2];
+
+c = &cpuid_ent[cpuid_i++];
+c->function = HV_CPUID_INTERFACE;
+memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
+c->eax = signature[0];
+c->ebx = 0;
+c->ecx = 0;
+c->edx = 0;
+
+c = &cpuid_ent[cpuid_i++];
+c->function = HV_CPUID_VERSION;
+c->eax = 0x1bbc;
+c->ebx = 0x00060001;
+
+c = &cpuid_ent[cpuid_i++];
+c->function = HV_CPUID_FEATURES;
+c->eax = env->features[FEAT_HYPERV_EAX];
+c->ebx = env->features[FEAT_HYPERV_EBX];
+c->edx = env->features[FEAT_HYPERV_EDX];
+
+c = &cpuid_ent[cpuid_i++];
+c->function = HV_CPUID_ENLIGHTMENT_INFO;
+c->eax = env->features[FEAT_HV_RECOMM_EAX];
+c->ebx = cpu->hyperv_spinlock_attempts;
+
+c = &cpuid_ent[cpuid_i++];
+c->function = HV_CPUID_IMPLEMENT_LIMITS;
+c->eax = cpu->hv_max_vps;
+c->ebx = 0x40;
+
+if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
+__u32 function;
+
+/* Create zeroed 0x4006..0x4009 leaves */
+for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
+ function < HV_CPUID_NESTED_FEATURES; function++) {
+c = &cpuid_ent[cpuid_i++];
+c->function = function;
+}
+
+c = &cpuid_ent[cpuid_i++];
+c->function = HV_CPUID_NESTED_FEATURES;
+c->eax = env->features[FEAT_HV_NESTED_EAX];
+}
+r = cpuid_i;
+
+free:
 g_free(cpuid);
 
-return r ? -ENOSYS : 0;
+return r;
 }
 
 static int hyperv_init_vcpu(X86CPU *cpu)
@@ -1179,79 +1262,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
 }
 
 /* Paravirtualization CPUIDs */
-if (hyperv_enabled(cpu)) {
-c = &cpuid_data.entries[cpuid_i++];
-c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
-if (!cpu->hyperv_vendor_id) {
-memcpy(signature, "Microsoft Hv", 12);
-} else {
-size_t len = strlen(cpu->hyperv_vendor_id);
-
-if (len > 12) {
-error_report("hv-vendor-id truncated to 12 characters");
-len = 12;
-}
-memset(signature, 0, 12);
-memcpy(signature, cpu->hyperv_vendor_id, len);
-}
-c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
-HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
-c->ebx = signature[0];
-c->ecx = signature[1];
-c->edx = signature[2];
-
-c = &cpuid_data.entries[cpuid_i++];
-c->function = HV_CPUID_INTERFACE;
-memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
-c->eax = signature[0];
-c->ebx = 0;
-c->ecx = 0;
-c->edx = 0;
-
-c = 

[Qemu-devel] [PULL 01/25] kvm-all: Add/update fprintf's for kvm_*_ioeventfd_del

2019-06-20 Thread Paolo Bonzini
From: Yury Kotov 

Signed-off-by: Yury Kotov 
Message-Id: <20190607090830.18807-1-yury-ko...@yandex-team.ru>
Signed-off-by: Paolo Bonzini 
---
 accel/kvm/kvm-all.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index b0c4bed..d2f481a 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -863,8 +863,8 @@ static void kvm_mem_ioeventfd_add(MemoryListener *listener,
data, true, int128_get64(section->size),
match_data);
 if (r < 0) {
-fprintf(stderr, "%s: error adding ioeventfd: %s\n",
-__func__, strerror(-r));
+fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
+__func__, strerror(-r), -r);
 abort();
 }
 }
@@ -881,6 +881,8 @@ static void kvm_mem_ioeventfd_del(MemoryListener *listener,
data, false, int128_get64(section->size),
match_data);
 if (r < 0) {
+fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
+__func__, strerror(-r), -r);
 abort();
 }
 }
@@ -897,8 +899,8 @@ static void kvm_io_ioeventfd_add(MemoryListener *listener,
   data, true, int128_get64(section->size),
   match_data);
 if (r < 0) {
-fprintf(stderr, "%s: error adding ioeventfd: %s\n",
-__func__, strerror(-r));
+fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
+__func__, strerror(-r), -r);
 abort();
 }
 }
@@ -916,6 +918,8 @@ static void kvm_io_ioeventfd_del(MemoryListener *listener,
   data, false, int128_get64(section->size),
   match_data);
 if (r < 0) {
+fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
+__func__, strerror(-r), -r);
 abort();
 }
 }
-- 
1.8.3.1





[Qemu-devel] [PULL 03/25] i386/kvm: convert hyperv enlightenments properties from bools to bits

2019-06-20 Thread Paolo Bonzini
From: Vitaly Kuznetsov 

Representing Hyper-V properties as bits will allow us to check features
and dependencies between them in a natural way.

Suggested-by: Roman Kagan 
Signed-off-by: Vitaly Kuznetsov 
Message-Id: <20190517141924.19024-2-vkuzn...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 hw/i386/pc.c  |  3 +-
 target/i386/cpu.c | 44 ++-
 target/i386/cpu.h | 37 ++-
 target/i386/hyperv.c  |  2 +-
 target/i386/kvm.c | 83 +--
 target/i386/machine.c |  2 +-
 6 files changed, 91 insertions(+), 80 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 2c5446b..e41192b 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -2386,7 +2386,8 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev,
 }
 cpu->thread_id = topo.smt_id;
 
-if (cpu->hyperv_vpindex && !kvm_hv_vpindex_settable()) {
+if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) &&
+!kvm_hv_vpindex_settable()) {
 error_setg(errp, "kernel doesn't allow setting HyperV VP_INDEX");
 return;
 }
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index fbed2eb..e90c1ac 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5853,21 +5853,37 @@ static Property x86_cpu_properties[] = {
 #endif
 DEFINE_PROP_INT32("node-id", X86CPU, node_id, CPU_UNSET_NUMA_NODE_ID),
 DEFINE_PROP_BOOL("pmu", X86CPU, enable_pmu, false),
+
 { .name  = "hv-spinlocks", .info  = _prop_spinlocks },
-DEFINE_PROP_BOOL("hv-relaxed", X86CPU, hyperv_relaxed_timing, false),
-DEFINE_PROP_BOOL("hv-vapic", X86CPU, hyperv_vapic, false),
-DEFINE_PROP_BOOL("hv-time", X86CPU, hyperv_time, false),
-DEFINE_PROP_BOOL("hv-crash", X86CPU, hyperv_crash, false),
-DEFINE_PROP_BOOL("hv-reset", X86CPU, hyperv_reset, false),
-DEFINE_PROP_BOOL("hv-vpindex", X86CPU, hyperv_vpindex, false),
-DEFINE_PROP_BOOL("hv-runtime", X86CPU, hyperv_runtime, false),
-DEFINE_PROP_BOOL("hv-synic", X86CPU, hyperv_synic, false),
-DEFINE_PROP_BOOL("hv-stimer", X86CPU, hyperv_stimer, false),
-DEFINE_PROP_BOOL("hv-frequencies", X86CPU, hyperv_frequencies, false),
-DEFINE_PROP_BOOL("hv-reenlightenment", X86CPU, hyperv_reenlightenment, 
false),
-DEFINE_PROP_BOOL("hv-tlbflush", X86CPU, hyperv_tlbflush, false),
-DEFINE_PROP_BOOL("hv-evmcs", X86CPU, hyperv_evmcs, false),
-DEFINE_PROP_BOOL("hv-ipi", X86CPU, hyperv_ipi, false),
+DEFINE_PROP_BIT64("hv-relaxed", X86CPU, hyperv_features,
+  HYPERV_FEAT_RELAXED, 0),
+DEFINE_PROP_BIT64("hv-vapic", X86CPU, hyperv_features,
+  HYPERV_FEAT_VAPIC, 0),
+DEFINE_PROP_BIT64("hv-time", X86CPU, hyperv_features,
+  HYPERV_FEAT_TIME, 0),
+DEFINE_PROP_BIT64("hv-crash", X86CPU, hyperv_features,
+  HYPERV_FEAT_CRASH, 0),
+DEFINE_PROP_BIT64("hv-reset", X86CPU, hyperv_features,
+  HYPERV_FEAT_RESET, 0),
+DEFINE_PROP_BIT64("hv-vpindex", X86CPU, hyperv_features,
+  HYPERV_FEAT_VPINDEX, 0),
+DEFINE_PROP_BIT64("hv-runtime", X86CPU, hyperv_features,
+  HYPERV_FEAT_RUNTIME, 0),
+DEFINE_PROP_BIT64("hv-synic", X86CPU, hyperv_features,
+  HYPERV_FEAT_SYNIC, 0),
+DEFINE_PROP_BIT64("hv-stimer", X86CPU, hyperv_features,
+  HYPERV_FEAT_STIMER, 0),
+DEFINE_PROP_BIT64("hv-frequencies", X86CPU, hyperv_features,
+  HYPERV_FEAT_FREQUENCIES, 0),
+DEFINE_PROP_BIT64("hv-reenlightenment", X86CPU, hyperv_features,
+  HYPERV_FEAT_REENLIGHTENMENT, 0),
+DEFINE_PROP_BIT64("hv-tlbflush", X86CPU, hyperv_features,
+  HYPERV_FEAT_TLBFLUSH, 0),
+DEFINE_PROP_BIT64("hv-evmcs", X86CPU, hyperv_features,
+  HYPERV_FEAT_EVMCS, 0),
+DEFINE_PROP_BIT64("hv-ipi", X86CPU, hyperv_features,
+  HYPERV_FEAT_IPI, 0),
+
 DEFINE_PROP_BOOL("check", X86CPU, check_cpuid, true),
 DEFINE_PROP_BOOL("enforce", X86CPU, enforce_cpuid, false),
 DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true),
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 0732e05..30cd1a0 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -734,6 +734,22 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY (1U << 3)
 #define MSR_ARCH_CAP_SSB_NO (1U << 4)
 
+/* Supported Hyper-V Enlightenments */
+#define HYPERV_FEAT_RELAXED 0
+#define HYPERV_FEAT_VAPIC   1
+#define HYPERV_FEAT_TIME2
+#define HYPERV_FEAT_CRASH   3
+#define HYPERV_FEAT_RESET   4
+#define HYPERV_FEAT_VPINDEX 5
+#define HYPERV_FEAT_RUNTIME 6
+#define HYPERV_FEAT_SYNIC   7
+#define HYPERV_FEAT_STIMER  8
+#define HYPERV_FEAT_FREQUENCIES 9
+#define HYPERV_FEAT_REENLIGHTENMENT 10
+#define 

[Qemu-devel] [PULL 02/25] hax: Honor CPUState::halted

2019-06-20 Thread Paolo Bonzini
From: Colin Xu 

QEMU tracks whether a vcpu is halted using CPUState::halted. E.g.,
after initialization or reset, halted is 0 for the BSP (vcpu 0)
and 1 for the APs (vcpu 1, 2, ...). A halted vcpu should not be
handed to the hypervisor to run (e.g. hax_vcpu_run()).

Under HAXM, Android Emulator sometimes boots into a "vcpu shutdown
request" error while executing in SeaBIOS, with the HAXM driver
logging a guest triple fault in vcpu 1, 2, ... at RIP 0x3. That is
ultimately because the HAX accelerator asks HAXM to run those APs
when they are still in the halted state.

Normally, the vcpu thread for an AP will start by looping in
qemu_wait_io_event(), until the BSP kicks it via a pair of IPIs
(INIT followed by SIPI). But because the HAX accelerator does not
honor cpu->halted, it allows the AP vcpu thread to proceed to
hax_vcpu_run() as soon as it receives any kick, even if the kick
does not come from the BSP. It turns out that the emulator has a
worker thread which periodically kicks every vcpu thread (possibly
to collect CPU usage data), and if one of these kicks comes before
those by the BSP, the AP will start execution from the wrong RIP,
resulting in the aforementioned SMP boot failure.

The solution is inspired by the KVM accelerator (credit to
Chuanxiao Dong  for the pointer):

1. Get rid of questionable logic that unconditionally resets
   cpu->halted before hax_vcpu_run(). Instead, only reset it at the
   right moments (there are only a few "unhalt" events).
2. Add a check for cpu->halted before hax_vcpu_run().

Note that although the non-Unrestricted Guest (!ug_platform) code
path also forcibly resets cpu->halted, it is left untouched,
because only the UG code path supports SMP guests.

The patch was first merged into the Android emulator with Change-Id:
I9c5752cc737fd305d7eace1768ea12a07309d716

Cc: Yu Ning 
Cc: Chuanxiao Dong 
Signed-off-by: Colin Xu 
Message-Id: <20190610021939.13669-1-colin...@intel.com>
---
 cpus.c|  1 -
 target/i386/hax-all.c | 36 ++--
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/cpus.c b/cpus.c
index dde3b7b..1af51b7 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1594,7 +1594,6 @@ static void *qemu_hax_cpu_thread_fn(void *arg)
 
 cpu->thread_id = qemu_get_thread_id();
 cpu->created = true;
-cpu->halted = 0;
 current_cpu = cpu;
 
 hax_init_vcpu(cpu);
diff --git a/target/i386/hax-all.c b/target/i386/hax-all.c
index 64fd51a..9e7b779 100644
--- a/target/i386/hax-all.c
+++ b/target/i386/hax-all.c
@@ -471,13 +471,35 @@ static int hax_vcpu_hax_exec(CPUArchState *env)
 return 0;
 }
 
-cpu->halted = 0;
-
 if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
 cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
 apic_poll_irq(x86_cpu->apic_state);
 }
 
+/* After a vcpu is halted (either because it is an AP and has just been
+ * reset, or because it has executed the HLT instruction), it will not be
+ * run (hax_vcpu_run()) until it is unhalted. The next few if blocks check
+ * for events that may change the halted state of this vcpu:
+ *  a) Maskable interrupt, when RFLAGS.IF is 1;
+ * Note: env->eflags may not reflect the current RFLAGS state, because
+ *   it is not updated after each hax_vcpu_run(). We cannot afford
+ *   to fail to recognize any unhalt-by-maskable-interrupt event
+ *   (in which case the vcpu will halt forever), and yet we cannot
+ *   afford the overhead of hax_vcpu_sync_state(). The current
+ *   solution is to err on the side of caution and have the HLT
+ *   handler (see case HAX_EXIT_HLT below) unconditionally set the
+ *   IF_MASK bit in env->eflags, which, in effect, disables the
+ *   RFLAGS.IF check.
+ *  b) NMI;
+ *  c) INIT signal;
+ *  d) SIPI signal.
+ */
+if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
+ (env->eflags & IF_MASK)) ||
+(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
+cpu->halted = 0;
+}
+
 if (cpu->interrupt_request & CPU_INTERRUPT_INIT) {
 DPRINTF("\nhax_vcpu_hax_exec: handling INIT for %d\n",
 cpu->cpu_index);
@@ -493,6 +515,16 @@ static int hax_vcpu_hax_exec(CPUArchState *env)
 hax_vcpu_sync_state(env, 1);
 }
 
+if (cpu->halted) {
+/* If this vcpu is halted, we must not ask HAXM to run it. Instead, we
+ * break out of hax_smp_cpu_exec() as if this vcpu had executed HLT.
+ * That way, this vcpu thread will be trapped in qemu_wait_io_event(),
+ * until the vcpu is unhalted.
+ */
+cpu->exception_index = EXCP_HLT;
+return 0;
+}
+
 do {
 int hax_ret;
 
-- 
1.8.3.1





Re: [Qemu-devel] [PATCH v4 00/13] Add migration support for VFIO device

2019-06-20 Thread Yan Zhao
On Fri, Jun 21, 2019 at 08:25:18AM +0800, Yan Zhao wrote:
> On Thu, Jun 20, 2019 at 10:37:28PM +0800, Kirti Wankhede wrote:
> > Add migration support for VFIO device
> > 
> > This Patch set include patches as below:
> > - Define KABI for VFIO device for migration support.
> > - Added save and restore functions for PCI configuration space
> > - Generic migration functionality for VFIO device.
> >   * This patch set adds functionality only for PCI devices, but can be
> > extended to other VFIO devices.
> >   * Added all the basic functions required for pre-copy, stop-and-copy and
> > resume phases of migration.
> >   * Added state change notifier and from that notifier function, the VFIO
> > device's state change is conveyed to the VFIO device driver.
> >   * During save setup phase and resume/load setup phase, migration region
> > is queried and is used to read/write VFIO device data.
> >   * .save_live_pending and .save_live_iterate are implemented to use QEMU's
> > functionality of iteration during pre-copy phase.
> >   * In .save_live_complete_precopy, that is in the stop-and-copy phase,
> > iteration to read data from the VFIO device driver is implemented until the
> > pending bytes returned by the driver are zero.
> >   * Added function to get dirty pages bitmap for the pages which are used by
> > driver.
> > - Add vfio_listerner_log_sync to mark dirty pages.
> > - Make VFIO PCI device migration capable. If the migration region is not
> >   provided by the driver, migration is blocked.
> > 
> > Below is the flow of state change for live migration where states in 
> > brackets
> > represent VM state, migration state and VFIO device state as:
> > (VM state, MIGRATION_STATUS, VFIO_DEVICE_STATE)
> > 
> > Live migration save path:
> > QEMU normal running state
> > (RUNNING, _NONE, _RUNNING)
> > |
> > migrate_init spawns migration_thread.
> > (RUNNING, _SETUP, _RUNNING|_SAVING)
> > Migration thread then calls each device's .save_setup()
> > |
> > (RUNNING, _ACTIVE, _RUNNING|_SAVING)
> > If device is active, get pending bytes by .save_live_pending()
> > if pending bytes >= threshold_size,  call save_live_iterate()
> > Data of VFIO device for pre-copy phase is copied.
> > Iterate till pending bytes converge and are less than threshold
> > |
> > On migration completion, vCPUs stop and .save_live_complete_precopy is called
> > for each active device. The VFIO device is then transitioned into the
> > _SAVING state.
> > (FINISH_MIGRATE, _DEVICE, _SAVING)
> > For VFIO device, iterate in  .save_live_complete_precopy  until
> > pending data is 0.
> > (FINISH_MIGRATE, _DEVICE, _STOPPED)
> 
> I suggest we also register to VMStateDescription, whose .pre_save
> handler would get called after .save_live_complete_precopy in pre-copy
> only case, and will be called before .save_live_iterate in post-copy
> enabled case.
> In the .pre_save handler, we can save all device state which must be
> copied after device stop in source vm and before device start in target vm.
> 
hi
to better describe this idea:

in pre-copy only case, the flow is

start migration --> .save_live_iterate (several rounds) --> stop source vm
--> .save_live_complete_precopy --> .pre_save --> start target vm
--> migration complete


in post-copy enabled case, the flow is

start migration --> .save_live_iterate (several rounds) --> start post copy -->
stop source vm --> .pre_save --> start target vm --> .save_live_iterate
(several rounds)
--> migration complete

Therefore, we should put the saving of device state in the .pre_save interface
rather than in .save_live_complete_precopy.
The device state includes PCI config data, page tables, register state, etc.

The .save_live_iterate and .save_live_complete_precopy should only deal
with saving dirty memory.
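
As a rough illustration of this suggestion, a minimal sketch of such a
registration could look like the following (all names are hypothetical and
not part of this series):

/*
 * Minimal sketch: a VMStateDescription whose .pre_save handler saves the
 * non-iterable device state (config space, page tables, registers, ...) at
 * the point in the migration stream described above.  Names are
 * illustrative only.
 */
static int vfio_dev_pre_save(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    /* read and save the device state that must follow device stop */
    (void)vbasedev;
    return 0;
}

static const VMStateDescription vmstate_vfio_dev = {
    .name = "vfio-device",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_save = vfio_dev_pre_save,
    .fields = (VMStateField[]) {
        VMSTATE_END_OF_LIST()
    },
};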


I know the current implementation does not support post-copy, but at least
it should not require huge changes when we decide to enable it in the future.

Thanks
Yan

> > |
> > (FINISH_MIGRATE, _COMPLETED, STOPPED)
> > Migration thread schedules cleanup bottom half and exits
> > 
> > Live migration resume path:
> > Incoming migration calls .load_setup for each device
> > (RESTORE_VM, _ACTIVE, STOPPED)
> > |
> > For each device, .load_state is called for that device section data
> > |
> > At the end, .load_cleanup is called for each device and vCPUs are started.
> > |
> > (RUNNING, _NONE, _RUNNING)
> > 
> > Note that:
> > - Migration post copy is not supported.
> > 
> > v3 -> v4:
> > - Added one more bit for _RESUMING flag to be set explicitly.
> > - data_offset field is read-only for user space application.
> > - data_size is read for every iteration before reading data from migration, 
> > that
> >   is removed assumption that data will be till end 

Re: [Qemu-devel] [RFC PATCH 0/9] hw/acpi: make build_madt arch agnostic

2019-06-20 Thread Wei Yang
On Thu, Jun 20, 2019 at 05:04:29PM +0200, Igor Mammedov wrote:
>On Thu, 20 Jun 2019 14:18:42 +
>Wei Yang  wrote:
>
>> On Wed, Jun 19, 2019 at 11:04:40AM +0200, Igor Mammedov wrote:
>> >On Wed, 19 Jun 2019 14:20:50 +0800
>> >Wei Yang  wrote:
>> >  
>> >> On Tue, Jun 18, 2019 at 05:59:56PM +0200, Igor Mammedov wrote:  
>> >> >
>> >> >On Mon, 13 May 2019 14:19:04 +0800
>> >> >Wei Yang  wrote:
>> >> >
>> >> >> Now MADT is highly dependent on the architecture and machine type, which
>> >> >> leaves duplicated code across architectures. This series tries to
>> >> >> generalize it.
>> >> >> 
>> >> >> MADT contains one main table and several sub tables. These sub tables 
>> >> >> are
>> >> >> highly related to architecture. Here we introduce one method to make it
>> >> >> architecture agnostic.
>> >> >> 
>> >> >>   * each architecture define its sub-table implementation function in 
>> >> >> madt_sub
>> >> >>   * introduces struct madt_input to collect sub table information and 
>> >> >> pass to
>> >> >> build_madt
>> >> >> 
>> >> >> By doing so, each architecture could prepare its own sub-table 
>> >> >> implementation
>> >> >> and madt_input. And keep build_madt architecture agnostic.
>> >> >
>> >> >I've skimmed over patches, and to me it looks mostly as code movement
>> >> >without apparent benefits and probably a bit more complex than what we 
>> >> >have now
>> >> >(it might be ok cost if it simplifies MADT support for other boards).
>> >> >
>> >> >Before I do line by line review could you demonstrate what effect new way
>> >> >to build MADT would have on arm/virt and i386/virt (from NEMU). So it 
>> >> >would be
>> >> >possible to estimate net benefits from new approach?
>> >> >(PS: it doesn't have to be patches ready for merging, just a dirty hack
>> >> >that would demonstrate adding MADT for new board using mad_sub[])
>> >> >
>> >> 
>> >> Per ACPI spec 5.2.12, MADT contains a *main* table and several *sub* tables
>> >> (Interrupt Controller Structures), so the idea is to give a callback hook in
>> >> AcpiDeviceIfClass for each table, including the *main* and *sub* tables.
>> >> 
>> >> Current AcpiDeviceIfClass has one callback pc_madt_cpu_entry for some 
>> >> *sub*
>> >> tables, after replacing the AcpiDeviceIfClass will look like this:
>> >> 
>> >> typedef struct AcpiDeviceIfClass {
>> >> /*  */
>> >> InterfaceClass parent_class;
>> >> 
>> >> /*  */
>> >> void (*ospm_status)(AcpiDeviceIf *adev, ACPIOSTInfoList ***list);
>> >> void (*send_event)(AcpiDeviceIf *adev, AcpiEventStatusBits ev);
>> >> -   void (*madt_cpu)(AcpiDeviceIf *adev, int uid,
>> >> -const CPUArchIdList *apic_ids, GArray *entry);
>> >> +   madt_operation madt_main;
>> >> +   madt_operation *madt_sub;
>> >> } AcpiDeviceIfClass;
>> >> 
>> >> By doing so, each arch could have its own implementation for MADT.
>> >> 
>> >> After this refactoring, build_madt could be simplified to:
>> >> 
>> >> build_madt(GArray *table_data, BIOSLinker *linker, PCMachineState *pcms,
>> >>struct madt_input *input)
>> >> {
>> >> ...
>> >> 
>> >> if (adevc->madt_main) {
>> >> adevc->madt_main(table_data, madt);
>> >> }
>> >> 
>> >> for (i = 0; ; i++) {
>> >> sub_id = input[i].sub_id;
>> >> if (sub_id == ACPI_APIC_RESERVED) {
>> >> break;
>> >> }
>> >> opaque = input[i].opaque;
>> >> adevc->madt_sub[sub_id](table_data, opaque);
>> >> }
>> >> 
>> >> ...
>> >> }
>> >> 
>> >> input is a list of the data necessary to build the *sub* tables. Its details
>> >> are also arch dependent.
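
(Purely to illustrate the shape of the proposed hooks, a hypothetical sketch
with made-up names could look like this:)

/*
 * Hypothetical sketch, not actual code: the arch provides per-record
 * callbacks indexed by sub-table type, and the board fills an input array
 * that build_madt() walks until it hits the ACPI_APIC_RESERVED terminator.
 */
static void x86_madt_lapic(GArray *table_data, void *opaque)
{
    /* append one Processor Local APIC structure built from 'opaque' */
}

static madt_operation x86_madt_sub[] = {
    [ACPI_APIC_PROCESSOR] = x86_madt_lapic,
    /* ... callbacks for the other record types ... */
};

static struct madt_input pc_madt_input[] = {
    { .sub_id = ACPI_APIC_PROCESSOR, .opaque = NULL /* e.g. apic_ids */ },
    { .sub_id = ACPI_APIC_RESERVED },   /* terminator */
};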
>> >I've got the general idea from reading the patches in this series.
>> >As I've mentioned before, it's hard to generalize MADT since it
>> >mostly contains entries unique to the target/board.
>> >The goal here isn't generalization at any cost, but rather finding out
>> >whether there is enough common code to justify generalization
>> >and whether it allows us to reduce code duplication and simplify.
>> >  
>> >> For a new arch that follows, what it needs to do is prepare the input array
>> >> and implement the necessary *main*/*sub* table callbacks.
>> >What I'd like to see is the actual patch that does this,
>> >to see if it has any merit and to compare to the current
>> >approach.  
>> 
>> I didn't quite get the idea behind your approach. Would you mind shedding
>> more light on it?
>With current approach, 'each board' has its own MADT build routine.
>Considering that there is very little to share between different
>implementations it might be ok.
>
>This series just adds an extra data structure for the board to populate
>and a bunch of callbacks for every record type. Essentially all
>the code we have now is still there. It was just moved elsewhere
>and made available via callbacks.

Yes, you are right.

>This series touches only pc/q35 machines and it's not apparent
>to me why it's any better than what we have now.

This is the demo for i386. In case you think this approach is reasonable, it

Re: [Qemu-devel] [PATCH 1/2] monitor: Add dump-stack command

2019-06-20 Thread Suraj Jitindar Singh
On Wed, 2019-05-01 at 15:35 +1000, Suraj Jitindar Singh wrote:
> Add a monitor command "dump-stack" to be used to dump the stack for
> the
> current cpu.

To summarise the discussion which occurred on this patch,

- It looks like it's ok to duplicate this functionality as it provides
an easier method to achieve this in the field and also for development.
- It's ok for this to remain as a separate command and to not place it
as a subcommand under info.

I'll rework based on the comments on 2/2 of the series and resend.
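
For reference, a hypothetical sketch of how a target could wire up the new
hook (names are invented for illustration; the actual ppc implementation
lives in 2/2 of the series):

/*
 * Illustrative only: a target's class_init hooks CPUClass::dump_stack, and
 * the handler prints via qemu_fprintf(), which falls back to the current
 * monitor when f is NULL, matching the cpu_dump_stack() contract below.
 */
static void foo_cpu_dump_stack(CPUState *cs, FILE *f)
{
    /* walk the guest stack frames here and print each one */
    qemu_fprintf(f, "CPU %d stack dump\n", cs->cpu_index);
}

static void foo_cpu_class_init(ObjectClass *oc, void *data)
{
    CPUClass *cc = CPU_CLASS(oc);

    cc->dump_stack = foo_cpu_dump_stack;
}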

Thanks,
Suraj

> 
> Signed-off-by: Suraj Jitindar Singh 
> ---
>  hmp-commands.hx   | 13 +
>  hmp.h |  1 +
>  include/qom/cpu.h | 10 ++
>  monitor.c | 12 
>  qom/cpu.c | 10 ++
>  5 files changed, 46 insertions(+)
> 
> diff --git a/hmp-commands.hx b/hmp-commands.hx
> index 9b4035965c..965ccdea28 100644
> --- a/hmp-commands.hx
> +++ b/hmp-commands.hx
> @@ -862,6 +862,19 @@ ETEXI
>  },
>  
>  STEXI
> +@item dump-stack
> +@findex dump-stack
> +dump stack of the cpu
> +ETEXI
> +{
> +.name   = "dump-stack",
> +.args_type  = "",
> +.params = "",
> +.help   = "dump stack",
> +.cmd= hmp_dumpstack,
> +},
> +
> +STEXI
>  @item pmemsave @var{addr} @var{size} @var{file}
>  @findex pmemsave
>  save to disk physical memory dump starting at @var{addr} of size
> @var{size}.
> diff --git a/hmp.h b/hmp.h
> index 43617f2646..e6edf1215c 100644
> --- a/hmp.h
> +++ b/hmp.h
> @@ -51,6 +51,7 @@ void hmp_announce_self(Monitor *mon, const QDict
> *qdict);
>  void hmp_cpu(Monitor *mon, const QDict *qdict);
>  void hmp_memsave(Monitor *mon, const QDict *qdict);
>  void hmp_pmemsave(Monitor *mon, const QDict *qdict);
> +void hmp_dumpstack(Monitor *mon, const QDict *qdict);
>  void hmp_ringbuf_write(Monitor *mon, const QDict *qdict);
>  void hmp_ringbuf_read(Monitor *mon, const QDict *qdict);
>  void hmp_cont(Monitor *mon, const QDict *qdict);
> diff --git a/include/qom/cpu.h b/include/qom/cpu.h
> index 08abcbd3fe..f2e83e9918 100644
> --- a/include/qom/cpu.h
> +++ b/include/qom/cpu.h
> @@ -181,6 +181,7 @@ typedef struct CPUClass {
>  int (*memory_rw_debug)(CPUState *cpu, vaddr addr,
> uint8_t *buf, int len, bool is_write);
>  void (*dump_state)(CPUState *cpu, FILE *, int flags);
> +void (*dump_stack)(CPUState *cpu, FILE *f);
>  GuestPanicInformation* (*get_crash_info)(CPUState *cpu);
>  void (*dump_statistics)(CPUState *cpu, int flags);
>  int64_t (*get_arch_id)(CPUState *cpu);
> @@ -568,6 +569,15 @@ enum CPUDumpFlags {
>  void cpu_dump_state(CPUState *cpu, FILE *f, int flags);
>  
>  /**
> + * cpu_dump_stack:
> + * @cpu: The CPU whose stack is to be dumped.
> + * @f: If non-null, dump to this stream, else to current print sink.
> + *
> + * Dumps CPU stack.
> + */
> +void cpu_dump_stack(CPUState *cpu, FILE *f);
> +
> +/**
>   * cpu_dump_statistics:
>   * @cpu: The CPU whose state is to be dumped.
>   * @flags: Flags what to dump.
> diff --git a/monitor.c b/monitor.c
> index 9b5f10b475..dbec2e4376 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -1299,6 +1299,18 @@ static void hmp_info_registers(Monitor *mon,
> const QDict *qdict)
>  }
>  }
>  
> +void hmp_dumpstack(Monitor *mon, const QDict *qdict)
> +{
> +CPUState *cs = mon_get_cpu();
> +
> +if (!cs) {
> +monitor_printf(mon, "No CPU available\n");
> +return;
> +}
> +
> +cpu_dump_stack(cs, NULL);
> +}
> +
>  #ifdef CONFIG_TCG
>  static void hmp_info_jit(Monitor *mon, const QDict *qdict)
>  {
> diff --git a/qom/cpu.c b/qom/cpu.c
> index 3c5493c96c..0dc10004f4 100644
> --- a/qom/cpu.c
> +++ b/qom/cpu.c
> @@ -230,6 +230,16 @@ void cpu_dump_state(CPUState *cpu, FILE *f, int
> flags)
>  }
>  }
>  
> +void cpu_dump_stack(CPUState *cpu, FILE *f)
> +{
> +CPUClass *cc = CPU_GET_CLASS(cpu);
> +
> +if (cc->dump_stack) {
> +cpu_synchronize_state(cpu);
> +cc->dump_stack(cpu, f);
> +}
> +}
> +
>  void cpu_dump_statistics(CPUState *cpu, int flags)
>  {
>  CPUClass *cc = CPU_GET_CLASS(cpu);



Re: [Qemu-devel] [PATCH v3 05/10] hw/riscv: Replace global smp variables with machine smp properties

2019-06-20 Thread Like Xu

On 2019/6/20 22:52, Eduardo Habkost wrote:

On Sun, May 19, 2019 at 04:54:23AM +0800, Like Xu wrote:

The global smp variables in riscv are replaced with smp machine properties.

A local variable of the same name is introduced in the declaration
phase if the value is used widely in the context, OR it is replaced on the spot
if it's only used once. No semantic changes.

Signed-off-by: Like Xu 
---
  hw/riscv/sifive_e.c| 6 --
  hw/riscv/sifive_plic.c | 3 +++
  hw/riscv/sifive_u.c| 6 --
  hw/riscv/spike.c   | 2 ++
  hw/riscv/virt.c| 1 +
  5 files changed, 14 insertions(+), 4 deletions(-)


This was incomplete, I had to apply the following fixup.

Signed-off-by: Eduardo Habkost 


Reviewed-by: Like Xu 


---
  hw/riscv/spike.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/hw/riscv/spike.c b/hw/riscv/spike.c
index 9e95f2c13c..d91d49dcae 100644
--- a/hw/riscv/spike.c
+++ b/hw/riscv/spike.c
@@ -172,6 +172,7 @@ static void spike_board_init(MachineState *machine)
  MemoryRegion *main_mem = g_new(MemoryRegion, 1);
  MemoryRegion *mask_rom = g_new(MemoryRegion, 1);
  int i;
+unsigned int smp_cpus = machine->smp.cpus;
  
  /* Initialize SOC */

object_initialize_child(OBJECT(machine), "soc", &s->soc, sizeof(s->soc),






Re: [Qemu-devel] [PATCH v4 00/13] Add migration support for VFIO device

2019-06-20 Thread Yan Zhao
On Thu, Jun 20, 2019 at 10:37:28PM +0800, Kirti Wankhede wrote:
> Add migration support for VFIO device
> 
> This Patch set include patches as below:
> - Define KABI for VFIO device for migration support.
> - Added save and restore functions for PCI configuration space
> - Generic migration functionality for VFIO device.
>   * This patch set adds functionality only for PCI devices, but can be
> extended to other VFIO devices.
>   * Added all the basic functions required for pre-copy, stop-and-copy and
> resume phases of migration.
>   * Added state change notifier and from that notifier function, the VFIO
> device's state change is conveyed to the VFIO device driver.
>   * During save setup phase and resume/load setup phase, migration region
> is queried and is used to read/write VFIO device data.
>   * .save_live_pending and .save_live_iterate are implemented to use QEMU's
> functionality of iteration during pre-copy phase.
>   * In .save_live_complete_precopy, that is in the stop-and-copy phase,
> iteration to read data from the VFIO device driver is implemented until the
> pending bytes returned by the driver are zero.
>   * Added function to get dirty pages bitmap for the pages which are used by
> driver.
> - Add vfio_listerner_log_sync to mark dirty pages.
> - Make VFIO PCI device migration capable. If the migration region is not
>   provided by the driver, migration is blocked.
> 
> Below is the flow of state change for live migration where states in brackets
> represent VM state, migration state and VFIO device state as:
> (VM state, MIGRATION_STATUS, VFIO_DEVICE_STATE)
> 
> Live migration save path:
> QEMU normal running state
> (RUNNING, _NONE, _RUNNING)
> |
> migrate_init spawns migration_thread.
> (RUNNING, _SETUP, _RUNNING|_SAVING)
> Migration thread then calls each device's .save_setup()
> |
> (RUNNING, _ACTIVE, _RUNNING|_SAVING)
> If device is active, get pending bytes by .save_live_pending()
> if pending bytes >= threshold_size,  call save_live_iterate()
> Data of VFIO device for pre-copy phase is copied.
> Iterate till pending bytes converge and are less than threshold
> |
> On migration completion, vCPUs stop and .save_live_complete_precopy is called
> for each active device. The VFIO device is then transitioned into the
> _SAVING state.
> (FINISH_MIGRATE, _DEVICE, _SAVING)
> For VFIO device, iterate in  .save_live_complete_precopy  until
> pending data is 0.
> (FINISH_MIGRATE, _DEVICE, _STOPPED)

I suggest we also register to VMStateDescription, whose .pre_save
handler would get called after .save_live_complete_precopy in the pre-copy
only case, and will be called before .save_live_iterate in the post-copy
enabled case.
In the .pre_save handler, we can save all device state which must be
copied after device stop in the source vm and before device start in the target vm.

> |
> (FINISH_MIGRATE, _COMPLETED, STOPPED)
> Migration thread schedules cleanup bottom half and exits
> 
> Live migration resume path:
> Incoming migration calls .load_setup for each device
> (RESTORE_VM, _ACTIVE, STOPPED)
> |
> For each device, .load_state is called for that device section data
> |
> At the end, .load_cleanup is called for each device and vCPUs are started.
> |
> (RUNNING, _NONE, _RUNNING)
> 
> Note that:
> - Migration post copy is not supported.
> 
> v3 -> v4:
> - Added one more bit for _RESUMING flag to be set explicitly.
> - data_offset field is read-only for user space application.
> - data_size is read for every iteration before reading data from the migration
>   region; that removes the assumption that data will extend till the end of the
>   migration region.
> - If vendor driver supports mappable sparsed region, map those region during
>   setup state of save/load, similarly unmap those from cleanup routines.
> - Handles a race condition that causes data corruption in the migration region
>   during save of device state by adding a mutex and serializing the save_buffer
>   and get_dirty_pages routines.
> - Skip calling the get_dirty_pages routine for mapped MMIO regions of the device.
> - Added trace events.
> - Split into multiple functional patches.
> 
> v2 -> v3:
> - Removed enum of VFIO device states. Defined VFIO device state with 2 bits.
> - Re-structured vfio_device_migration_info to keep it minimal and defined 
> action
>   on read and write access on its members.
> 
> v1 -> v2:
> - Defined MIGRATION region type and sub-type which should be used with region
>   type capability.
> - Re-structured vfio_device_migration_info. This structure will be placed at 
> 0th
>   offset of migration region.
> - Replaced ioctl with read/write for trapped part of migration region.
> - Added both type of access support, trapped or mmapped, for data section of 
> the
>   region.
> - Moved PCI device 

Re: [Qemu-devel] [PATCH v4 08/13] vfio: Add save state functions to SaveVMHandlers

2019-06-20 Thread Yan Zhao
On Thu, Jun 20, 2019 at 10:37:36PM +0800, Kirti Wankhede wrote:
> Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy
> functions. These functions handle the pre-copy and stop-and-copy phases.
> 
> In _SAVING|_RUNNING device state or pre-copy phase:
> - read pending_bytes
> - read data_offset - indicates to the kernel driver to write data to the
>   staging buffer, which is mmapped.
> - read data_size - amount of data in bytes written by vendor driver in 
> migration
>   region.
> - if data section is trapped, pread() number of bytes in data_size, from
>   data_offset.
> - if data section is mmaped, read mmaped buffer of size data_size.
> - Write data packet to file stream as below:
> {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data,
> VFIO_MIG_FLAG_END_OF_STATE }
> 
> In _SAVING device state or stop-and-copy phase
> a. read config space of device and save to migration file stream. This
>doesn't need to be from vendor driver. Any other special config state
>from driver can be saved as data in following iteration.
> b. read pending_bytes - indicates to the kernel driver to write data to the
>staging buffer, which is mmapped.
> c. read data_size - amount of data in bytes written by vendor driver in
>migration region.
> d. if data section is trapped, pread() from data_offset of size data_size.
> e. if data section is mmaped, read mmaped buffer of size data_size.
> f. Write data packet as below:
>{VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data}
> g. iterate through steps b to f while (pending_bytes > 0)
> h. Write {VFIO_MIG_FLAG_END_OF_STATE}
> 
> .save_live_iterate runs outside the iothread lock in the migration case, which
> could race with the asynchronous call to get the dirty page list, causing data
> corruption in the mapped migration region. A mutex is added here to serialize
> the migration buffer read operation.
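
(For readers of the thread, the serialization described above amounts to
roughly the following sketch; the lock name is an assumption, the actual
series defines its own field.)

/*
 * Sketch only: both the migration buffer read and the dirty-page query take
 * the same per-device mutex, so they cannot interleave on the migration
 * region.  'migration->lock' is an assumed field name.
 */
static int vfio_save_buffer_locked(QEMUFile *f, VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    qemu_mutex_lock(&migration->lock);
    ret = vfio_save_buffer(f, vbasedev);
    qemu_mutex_unlock(&migration->lock);

    return ret;
}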
> 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 
> ---
>  hw/vfio/migration.c | 212 
> 
>  1 file changed, 212 insertions(+)
> 
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index fe0887c27664..0a2f30872316 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -107,6 +107,111 @@ static int vfio_migration_set_state(VFIODevice 
> *vbasedev, uint32_t state)
>  return 0;
>  }
>  
> +static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev)
> +{
> +VFIOMigration *migration = vbasedev->migration;
> +VFIORegion *region = &migration->region.buffer;
> +uint64_t data_offset = 0, data_size = 0;
> +int ret;
> +
> +ret = pread(vbasedev->fd, &data_offset, sizeof(data_offset),
> +region->fd_offset + offsetof(struct 
> vfio_device_migration_info,
> + data_offset));
> +if (ret != sizeof(data_offset)) {
> +error_report("Failed to get migration buffer data offset %d",
> + ret);
> +return -EINVAL;
> +}
> +
> +ret = pread(vbasedev->fd, &data_size, sizeof(data_size),
> +region->fd_offset + offsetof(struct 
> vfio_device_migration_info,
> + data_size));
> +if (ret != sizeof(data_size)) {
> +error_report("Failed to get migration buffer data size %d",
> + ret);
> +return -EINVAL;
> +}
> +
How big is data_size?
If this size is too big, it may take too much time and block others.

> +if (data_size > 0) {
> +void *buf = NULL;
> +bool buffer_mmaped = false;
> +
> +if (region->mmaps) {
> +int i;
> +
> +for (i = 0; i < region->nr_mmaps; i++) {
> +if ((data_offset >= region->mmaps[i].offset) &&
> +(data_offset < region->mmaps[i].offset +
> +   region->mmaps[i].size)) {
> +buf = region->mmaps[i].mmap + (data_offset -
> +   region->mmaps[i].offset);
> +buffer_mmaped = true;
> +break;
> +}
> +}
> +}
> +
> +if (!buffer_mmaped) {
> +buf = g_malloc0(data_size);
> +ret = pread(vbasedev->fd, buf, data_size,
> +region->fd_offset + data_offset);
> +if (ret != data_size) {
> +error_report("Failed to get migration data %d", ret);
> +g_free(buf);
> +return -EINVAL;
> +}
> +}
> +
> +qemu_put_be64(f, data_size);
> +qemu_put_buffer(f, buf, data_size);
> +
> +if (!buffer_mmaped) {
> +g_free(buf);
> +}
> +migration->pending_bytes -= data_size;
> +} else {
> +qemu_put_be64(f, data_size);
> +}
> +
> +ret = qemu_file_get_error(f);
> +
> +return data_size;
> +}
> +
> +static int vfio_update_pending(VFIODevice *vbasedev)
> +{
> +VFIOMigration *migration = 

Re: [Qemu-devel] [PATCH] pc: fix possible NULL pointer dereference in pc_machine_get_device_memory_region_size()

2019-06-20 Thread Paolo Bonzini
On 10/06/19 15:50, Igor Mammedov wrote:
> QEMU will crash when device-memory-region-size property is read if 
> ms->device_memory
> wasn't initialized yet.
> 
> Crash can be reproduced with:
>  $QEMU -preconfig -qmp unix:qmp_socket,server,nowait &
>  ./scripts/qmp/qom-get -s qmp_socket /machine.device-memory-region-size
> 
> Instead of crashing return 0 if ms->device_memory hasn't been initialized.

This patch breaks bios-tables-test /x86_64/acpi/piix64/cpuhp:

acpi-test: Warning! SRAT binary file mismatch. Actual [aml:/tmp/aml-RIFK3Z], 
Expected [aml:tests/data/acpi/pc/SRAT.memhp].
acpi-test: Warning! SRAT mismatch. Actual [asl:/tmp/asl-TLFK3Z.dsl, 
aml:/tmp/aml-RIFK3Z], Expected [asl:/tmp/asl-JL5J3Z.dsl, 
aml:tests/data/acpi/pc/SRAT.memhp].
**
ERROR:/home/pbonzini/work/upstream/qemu/tests/bios-tables-test.c:434:test_acpi_asl:
 assertion failed: (all_tables_match)
ERROR - Bail out! 
ERROR:/home/pbonzini/work/upstream/qemu/tests/bios-tables-test.c:434:test_acpi_asl:
 assertion failed: (all_tables_match)

So I'm removing it from the pull request.

Paolo

> Signed-off-by: Igor Mammedov 
> ---
> v2:
>   add reproducer to commit message
>(Markus Armbruster )
> 
>  hw/i386/pc.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index edc240b..1b7ead9 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -2459,7 +2459,11 @@ pc_machine_get_device_memory_region_size(Object *obj, 
> Visitor *v,
>   Error **errp)
>  {
>  MachineState *ms = MACHINE(obj);
> -int64_t value = memory_region_size(&ms->device_memory->mr);
> +int64_t value = 0;
> +
> +if (ms->device_memory) {
> +memory_region_size(&ms->device_memory->mr);
> +}
>  
>  visit_type_int(v, name, &value, errp);
>  }
> 




Re: [Qemu-devel] [PATCH v4 03/13] vfio: Add save and load functions for VFIO PCI devices

2019-06-20 Thread Yan Zhao
On Thu, Jun 20, 2019 at 10:37:31PM +0800, Kirti Wankhede wrote:
> These functions save and restore PCI device specific data - the config
> space of the PCI device.
> Save and restore were tested with MSI and MSI-X interrupt types.
> 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 
> ---
>  hw/vfio/pci.c | 112 
> ++
>  hw/vfio/pci.h |  29 +++
>  2 files changed, 141 insertions(+)
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index ce3fe96efe2c..09a0821a5b1c 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1187,6 +1187,118 @@ void vfio_pci_write_config(PCIDevice *pdev,
>  }
>  }
>  
> +void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +PCIDevice *pdev = &vdev->pdev;
> +uint16_t pci_cmd;
> +int i;
> +
> +for (i = 0; i < PCI_ROM_SLOT; i++) {
> +uint32_t bar;
> +
> +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
> +qemu_put_be32(f, bar);
> +}
> +
> +qemu_put_be32(f, vdev->interrupt);
> +if (vdev->interrupt == VFIO_INT_MSI) {
> +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +bool msi_64bit;
> +
> +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + 
> PCI_MSI_FLAGS,
> +2);
> +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +msi_addr_lo = pci_default_read_config(pdev,
> + pdev->msi_cap + PCI_MSI_ADDRESS_LO, 
> 4);
> +qemu_put_be32(f, msi_addr_lo);
> +
> +if (msi_64bit) {
> +msi_addr_hi = pci_default_read_config(pdev,
> + pdev->msi_cap + 
> PCI_MSI_ADDRESS_HI,
> + 4);
> +}
> +qemu_put_be32(f, msi_addr_hi);
> +
> +msi_data = pci_default_read_config(pdev,
> +pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
> PCI_MSI_DATA_32),
> +2);
> +qemu_put_be32(f, msi_data);
> +} else if (vdev->interrupt == VFIO_INT_MSIX) {
> +uint16_t offset;
> +
> +/* save enable bit and maskall bit */
> +offset = pci_default_read_config(pdev,
> +   pdev->msix_cap + PCI_MSIX_FLAGS + 1, 
> 2);
> +qemu_put_be16(f, offset);
> +msix_save(pdev, f);
> +}
> +pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +qemu_put_be16(f, pci_cmd);
> +}
> +
> +void vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +PCIDevice *pdev = &vdev->pdev;
> +uint32_t interrupt_type;
> +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +uint16_t pci_cmd;
> +bool msi_64bit;
> +int i;
> +
> +/* restore pci bar configuration */
> +pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +vfio_pci_write_config(pdev, PCI_COMMAND,
> +pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 
> 2);
> +for (i = 0; i < PCI_ROM_SLOT; i++) {
> +uint32_t bar = qemu_get_be32(f);
> +
> +vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> +}
> +vfio_pci_write_config(pdev, PCI_COMMAND,
> +  pci_cmd | PCI_COMMAND_IO | PCI_COMMAND_MEMORY, 2);
> +
> +interrupt_type = qemu_get_be32(f);
> +
> +if (interrupt_type == VFIO_INT_MSI) {
> +/* restore msi configuration */
> +msi_flags = pci_default_read_config(pdev,
> +pdev->msi_cap + PCI_MSI_FLAGS, 
> 2);
> +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +  msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> +
> +msi_addr_lo = qemu_get_be32(f);
> +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> +  msi_addr_lo, 4);
> +
> +msi_addr_hi = qemu_get_be32(f);
> +if (msi_64bit) {
> +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +  msi_addr_hi, 4);
> +}
> +msi_data = qemu_get_be32(f);
> +vfio_pci_write_config(pdev,
> +pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
> PCI_MSI_DATA_32),
> +msi_data, 2);
> +
> +vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +  msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> +} else if (interrupt_type == VFIO_INT_MSIX) {
> +uint16_t offset = qemu_get_be16(f);
> +
> +/* load enable bit and maskall bit */
> +vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> +  offset, 2);
> +

[Qemu-devel] [RFC v2 PATCH] hw/arm/virt: makes virt a default machine type

2019-06-20 Thread Wainer dos Santos Moschetta
Peter, Cleber,

Re-sending this email because I forgot to copy the mailing listing.
Sorry. Original message below.

==
Hi Peter et al.,

I came across this when running the acceptance tests on an aarch64 host.
The arch-independent tests fail because, in general, they don't set a
machine type. In order to avoid treating arm targets as special cases
in the avocado_qemu framework, I preferred to attempt to promote virt as
the default for ARM emulation. Moreover, since it represents generic hardware
and its use is broadly advised [1], I found it the right choice.

Maybe that topic was discussed already, but I didn't find any reference in
either the mailing list or the git logs. It is also true
that I am ignorant of the ARM platform, thus I might be missing
something. That explains why I am sending this patch as an RFC.

[1] https://wiki.qemu.org/Documentation/Platforms/ARM

Thanks,

Wainer
-- >8 --
Currently no arm target has a default machine type, unlike
others such as x86_64 and ppc64. The 'virt' (alias) type represents
generic ARM hardware, so let's make it the default machine for
ARM emulation.

Signed-off-by: Wainer dos Santos Moschetta 
---
 hw/arm/virt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 431e2900fd..2f8aa2bfb7 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -74,6 +74,7 @@
 mc->desc = "QEMU " # major "." # minor " ARM Virtual Machine"; \
 if (latest) { \
 mc->alias = "virt"; \
+mc->is_default = 1; \
 } \
 } \
 static const TypeInfo machvirt_##major##_##minor##_info = { \
-- 
2.18.1




Re: [Qemu-devel] [PATCH 0/2] i386: Introduce X86CPUCacheCPUID struct

2019-06-20 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20190620203616.22715-1-ehabk...@redhat.com/



Hi,

This series failed the asan build test. Please find the testing commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-fedora V=1 NETWORK=1
time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  x86_64-softmmu/target/i386/svm_helper.o
  CC  x86_64-softmmu/target/i386/machine.o
  CC  x86_64-softmmu/target/i386/arch_memory_mapping.o
/tmp/qemu-test/src/target/i386/cpu.c:1484:15: error: initializer element is not 
a compile-time constant
.cpuid2 = epyc_cache_info,
  ^~~
1 error generated.
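
(The diagnostic comes from a C language rule: a static initializer must be a
constant expression, and reading a const-qualified object is not one,
although taking its address is. A minimal illustration, unrelated to the
QEMU sources:)

/* Not QEMU code: minimal reproduction of the class of error reported above. */
static const int x = 1;
static int broken = x;        /* error: initializer element is not a compile-time constant */
static const int *fine = &x;  /* OK: an address constant is a constant expression */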


The full log is available at
http://patchew.org/logs/20190620203616.22715-1-ehabk...@redhat.com/testing.asan/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

[Qemu-devel] [PATCH 0/2] i386: Introduce X86CPUCacheCPUID struct

2019-06-20 Thread Eduardo Habkost
The new struct will let us declare the existing legacy CPU
cache info in a static constant, instead of defining it inside
x86_cpu_realizefn().  While doing it, make the CPU cache
declarations all constants.

This will help us represent the model-specific cache info as QOM
properties in the future.  Currently X86CPUDefinition::cache_info
(which is being renamed to X86CPUDefinition::cache_cpuid) is the
only CPU model field that can't be represented as a QOM property
value.

Eduardo Habkost (2):
  i386: make cache structs const-safe
  i386: Introduce X86CPUCacheCPUID struct

 target/i386/cpu.h |  15 +++---
 target/i386/cpu.c | 134 +++---
 2 files changed, 84 insertions(+), 65 deletions(-)

-- 
2.18.0.rc1.1.g3f1ff2140




[Qemu-devel] [PATCH 1/2] i386: make cache structs const-safe

2019-06-20 Thread Eduardo Habkost
This code will be refactored and it will be useful to make it
const-safe to catch mistakes.

Signed-off-by: Eduardo Habkost 
---
 target/i386/cpu.h |  8 
 target/i386/cpu.c | 36 ++--
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 0732e059ec..2f03489bf0 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1109,10 +1109,10 @@ typedef struct CPUCacheInfo {
 
 
 typedef struct CPUCaches {
-CPUCacheInfo *l1d_cache;
-CPUCacheInfo *l1i_cache;
-CPUCacheInfo *l2_cache;
-CPUCacheInfo *l3_cache;
+const CPUCacheInfo *l1d_cache;
+const CPUCacheInfo *l1i_cache;
+const CPUCacheInfo *l2_cache;
+const CPUCacheInfo *l3_cache;
 } CPUCaches;
 
 typedef struct CPUX86State {
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index fbed2eb804..a6acd71911 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -205,7 +205,7 @@ struct CPUID2CacheDescriptorInfo cpuid2_cache_descriptors[] 
= {
  * Return a CPUID 2 cache descriptor for a given cache.
  * If no known descriptor is found, return CACHE_DESCRIPTOR_UNAVAILABLE
  */
-static uint8_t cpuid2_cache_descriptor(CPUCacheInfo *cache)
+static uint8_t cpuid2_cache_descriptor(const CPUCacheInfo *cache)
 {
 int i;
 
@@ -249,7 +249,7 @@ static uint8_t cpuid2_cache_descriptor(CPUCacheInfo *cache)
 
 
 /* Encode cache info for CPUID[4] */
-static void encode_cache_cpuid4(CPUCacheInfo *cache,
+static void encode_cache_cpuid4(const CPUCacheInfo *cache,
 int num_apic_ids, int num_cores,
 uint32_t *eax, uint32_t *ebx,
 uint32_t *ecx, uint32_t *edx)
@@ -282,7 +282,7 @@ static void encode_cache_cpuid4(CPUCacheInfo *cache,
 }
 
 /* Encode cache info for CPUID[0x8005].ECX or CPUID[0x8005].EDX */
-static uint32_t encode_cache_cpuid8005(CPUCacheInfo *cache)
+static uint32_t encode_cache_cpuid8005(const CPUCacheInfo *cache)
 {
 assert(cache->size % 1024 == 0);
 assert(cache->lines_per_tag > 0);
@@ -312,8 +312,8 @@ static uint32_t encode_cache_cpuid8005(CPUCacheInfo 
*cache)
  * Encode cache info for CPUID[0x8006].ECX and CPUID[0x8006].EDX
  * @l3 can be NULL.
  */
-static void encode_cache_cpuid8006(CPUCacheInfo *l2,
-   CPUCacheInfo *l3,
+static void encode_cache_cpuid8006(const CPUCacheInfo *l2,
+   const CPUCacheInfo *l3,
uint32_t *ecx, uint32_t *edx)
 {
 assert(l2->size % 1024 == 0);
@@ -394,9 +394,9 @@ static int cores_in_core_complex(int nr_cores)
 }
 
 /* Encode cache info for CPUID[801D] */
-static void encode_cache_cpuid801d(CPUCacheInfo *cache, CPUState *cs,
-uint32_t *eax, uint32_t *ebx,
-uint32_t *ecx, uint32_t *edx)
+static void encode_cache_cpuid801d(const CPUCacheInfo *cache, CPUState *cs,
+   uint32_t *eax, uint32_t *ebx,
+   uint32_t *ecx, uint32_t *edx)
 {
 uint32_t l3_cores;
 assert(cache->size == cache->line_size * cache->associativity *
@@ -541,7 +541,7 @@ static void encode_topo_cpuid801e(CPUState *cs, X86CPU 
*cpu,
  */
 
 /* L1 data cache: */
-static CPUCacheInfo legacy_l1d_cache = {
+static const CPUCacheInfo legacy_l1d_cache = {
 .type = DATA_CACHE,
 .level = 1,
 .size = 32 * KiB,
@@ -554,7 +554,7 @@ static CPUCacheInfo legacy_l1d_cache = {
 };
 
 /*FIXME: CPUID leaf 0x8005 is inconsistent with leaves 2 & 4 */
-static CPUCacheInfo legacy_l1d_cache_amd = {
+static const CPUCacheInfo legacy_l1d_cache_amd = {
 .type = DATA_CACHE,
 .level = 1,
 .size = 64 * KiB,
@@ -568,7 +568,7 @@ static CPUCacheInfo legacy_l1d_cache_amd = {
 };
 
 /* L1 instruction cache: */
-static CPUCacheInfo legacy_l1i_cache = {
+static const CPUCacheInfo legacy_l1i_cache = {
 .type = INSTRUCTION_CACHE,
 .level = 1,
 .size = 32 * KiB,
@@ -581,7 +581,7 @@ static CPUCacheInfo legacy_l1i_cache = {
 };
 
 /*FIXME: CPUID leaf 0x8005 is inconsistent with leaves 2 & 4 */
-static CPUCacheInfo legacy_l1i_cache_amd = {
+static const CPUCacheInfo legacy_l1i_cache_amd = {
 .type = INSTRUCTION_CACHE,
 .level = 1,
 .size = 64 * KiB,
@@ -595,7 +595,7 @@ static CPUCacheInfo legacy_l1i_cache_amd = {
 };
 
 /* Level 2 unified cache: */
-static CPUCacheInfo legacy_l2_cache = {
+static const CPUCacheInfo legacy_l2_cache = {
 .type = UNIFIED_CACHE,
 .level = 2,
 .size = 4 * MiB,
@@ -608,7 +608,7 @@ static CPUCacheInfo legacy_l2_cache = {
 };
 
 /*FIXME: CPUID leaf 2 descriptor is inconsistent with CPUID leaf 4 */
-static CPUCacheInfo legacy_l2_cache_cpuid2 = {
+static const CPUCacheInfo legacy_l2_cache_cpuid2 = {
 .type = UNIFIED_CACHE,
 .level = 2,
 .size = 2 * MiB,
@@ -618,7 

[Qemu-devel] [PATCH 2/2] i386: Introduce X86CPUCacheCPUID struct

2019-06-20 Thread Eduardo Habkost
The new struct will be used to simplify the code that deals with
legacy cache information.

Signed-off-by: Eduardo Habkost 
---
 target/i386/cpu.h |   7 +---
 target/i386/cpu.c | 100 --
 2 files changed, 63 insertions(+), 44 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 2f03489bf0..86cf04d441 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1306,11 +1306,8 @@ typedef struct CPUX86State {
 /* Features that were explicitly enabled/disabled */
 FeatureWordArray user_features;
 uint32_t cpuid_model[12];
-/* Cache information for CPUID.  When legacy-cache=on, the cache data
- * on each CPUID leaf will be different, because we keep compatibility
- * with old QEMU versions.
- */
-CPUCaches cache_info_cpuid2, cache_info_cpuid4, cache_info_amd;
+/* Cache information for CPUID */
+const struct X86CPUCacheCPUID *caches;
 
 /* MTRRs */
 uint64_t mtrr_fixed[11];
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index a6acd71911..e9f301f9ea 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1412,6 +1412,10 @@ static char *x86_cpu_class_get_model_name(X86CPUClass 
*cc)
  strlen(class_name) - strlen(X86_CPU_TYPE_SUFFIX));
 }
 
+typedef struct X86CPUCacheCPUID {
+CPUCaches cpuid2, cpuid4, amd;
+} X86CPUCacheCPUID;
+
 struct X86CPUDefinition {
 const char *name;
 uint32_t level;
@@ -1423,7 +1427,7 @@ struct X86CPUDefinition {
 int stepping;
 FeatureWordArray features;
 const char *model_id;
-const CPUCaches *cache_info;
+const X86CPUCacheCPUID *cache_cpuid;
 };
 
 static const CPUCaches epyc_cache_info = {
@@ -1476,6 +1480,39 @@ static const CPUCaches epyc_cache_info = {
 },
 };
 
+static X86CPUCacheCPUID epyc_cache_cpuid = {
+.cpuid2 = epyc_cache_info,
+.cpuid4 = epyc_cache_info,
+.amd = epyc_cache_info,
+};
+
+/*
+ * Legacy cache template.  When legacy-cache=on, the cache data
+ * on each CPUID leaf will be different, because we keep compatibility
+ * with old QEMU versions.
+ */
+static X86CPUCacheCPUID legacy_cache_cpuid = {
+.cpuid2 = {
+.l1d_cache = &legacy_l1d_cache,
+.l1i_cache = &legacy_l1i_cache,
+.l2_cache = &legacy_l2_cache_cpuid2,
+.l3_cache = &legacy_l3_cache,
+},
+.cpuid4 = {
+.l1d_cache = &legacy_l1d_cache,
+.l1i_cache = &legacy_l1i_cache,
+.l2_cache = &legacy_l2_cache,
+.l3_cache = &legacy_l3_cache,
+},
+.amd = {
+.l1d_cache = &legacy_l1d_cache_amd,
+.l1i_cache = &legacy_l1i_cache_amd,
+.l2_cache = &legacy_l2_cache_amd,
+.l3_cache = &legacy_l3_cache,
+},
+};
+
+
 static X86CPUDefinition builtin_x86_defs[] = {
 {
 .name = "qemu64",
@@ -2886,7 +2923,7 @@ static X86CPUDefinition builtin_x86_defs[] = {
 CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE,
 .xlevel = 0x801E,
 .model_id = "AMD EPYC Processor",
-.cache_info = &epyc_cache_info,
+.cache_cpuid = &epyc_cache_cpuid,
 },
 {
 .name = "EPYC-IBPB",
@@ -2936,7 +2973,7 @@ static X86CPUDefinition builtin_x86_defs[] = {
 CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE,
 .xlevel = 0x801E,
 .model_id = "AMD EPYC Processor (with IBPB)",
-.cache_info = &epyc_cache_info,
+.cache_cpuid = &epyc_cache_cpuid,
 },
 {
 .name = "Dhyana",
@@ -2986,7 +3023,7 @@ static X86CPUDefinition builtin_x86_defs[] = {
 CPUID_SVM_NPT | CPUID_SVM_NRIPSAVE,
 .xlevel = 0x801E,
 .model_id = "Hygon Dhyana Processor",
-.cache_info = &epyc_cache_info,
+.cache_cpuid = &epyc_cache_cpuid,
 },
 };
 
@@ -3951,7 +3988,7 @@ static void x86_cpu_load_def(X86CPU *cpu, 
X86CPUDefinition *def, Error **errp)
 }
 
 /* legacy-cache defaults to 'off' if CPU model provides cache info */
-cpu->legacy_cache = !def->cache_info;
+cpu->legacy_cache = !def->cache_cpuid;
 
 /* Special cases not set in the X86CPUDefinition structs: */
 /* TODO: in-kernel irqchip for hvf */
@@ -4301,11 +4338,11 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 if (!cpu->enable_l3_cache) {
 *ecx = 0;
 } else {
-*ecx = cpuid2_cache_descriptor(env->cache_info_cpuid2.l3_cache);
+*ecx = cpuid2_cache_descriptor(env->caches->cpuid2.l3_cache);
 }
-*edx = (cpuid2_cache_descriptor(env->cache_info_cpuid2.l1d_cache) << 
16) |
-   (cpuid2_cache_descriptor(env->cache_info_cpuid2.l1i_cache) <<  
8) |
-   (cpuid2_cache_descriptor(env->cache_info_cpuid2.l2_cache));
+*edx = (cpuid2_cache_descriptor(env->caches->cpuid2.l1d_cache) << 16) |
+   (cpuid2_cache_descriptor(env->caches->cpuid2.l1i_cache) <<  8) |
+   (cpuid2_cache_descriptor(env->caches->cpuid2.l2_cache));
 break;
 case 4:
 /* cache info: needed for Core compatibility */
@@ -4320,24 +4357,24 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t 

Re: [Qemu-devel] [PATCH v11] ssh: switch from libssh2 to libssh

2019-06-20 Thread Philippe Mathieu-Daudé
On 6/20/19 10:08 PM, Pino Toscano wrote:
> Rewrite the implementation of the ssh block driver to use libssh instead
> of libssh2.  The libssh library has various advantages over libssh2:
> - easier API for authentication (for example for using ssh-agent)
> - easier API for known_hosts handling
> - supports newer types of keys in known_hosts
> 
> Use APIs/features available in libssh 0.8 conditionally, to support
> older versions (which are not recommended though).
> 
> Adjust the iotest 207 according to the different error message, and to
> find the default key type for localhost (to properly compare the
> fingerprint with).
> Contributed-by: Max Reitz 
> 
> Adjust the various Docker/Travis scripts to use libssh when available
> instead of libssh2. The mingw/mxe testing is dropped for now, as there
> are no packages for it.
> 
> Signed-off-by: Pino Toscano 
> Tested-by: Philippe Mathieu-Daudé 
> Acked-by: Alex Bennée 

Reviewed-by: Philippe Mathieu-Daudé 

> ---
> 
> Changes from v10:
> - improve error message for key mismatch
> - integrate Max Reitz' fix to iotest 207 to detect the key type used by
>   localhost
> 
> Changes from v9:
> - restored "default" case in the server status switch for libssh < 0.8.0
> - print the host key type & fingerprint on mismatch with known_hosts
> - improve/fix message for failed socket_set_nodelay()
> - reset s->sock properly
> 
> Changes from v8:
> - use a newer key type in iotest 207
> - improve the commit message
> 
> Changes from v7:
> - #if HAVE_LIBSSH_0_8 -> #ifdef HAVE_LIBSSH_0_8
> - ptrdiff_t -> size_t
> 
> Changes from v6:
> - fixed few checkpatch style issues
> - detect libssh 0.8 via symbol detection
> - adjust travis/docker test material
> - remove dead "default" case in a switch
> - use variables for storing MIN() results
> - adapt a documentation bit
> 
> Changes from v5:
> - adapt to newer tracing APIs
> - disable ssh compression (mimic what libssh2 does by default)
> - use build time checks for libssh 0.8, and use newer APIs directly
> 
> Changes from v4:
> - fix wrong usages of error_setg/session_error_setg/sftp_error_setg
> - fix few return code checks
> - remove now-unused parameters in few internal functions
> - allow authentication with "none" method
> - switch to unsigned int for the port number
> - enable TCP_NODELAY on the socket
> - fix one reference error message in iotest 207
> 
> Changes from v3:
> - fix socket cleanup in connect_to_ssh()
> - add comments about the socket cleanup
> - improve the error reporting (closer to what was with libssh2)
> - improve EOF detection on sftp_read()
> 
> Changes from v2:
> - used again an own fd
> - fixed co_yield() implementation
> 
> Changes from v1:
> - fixed jumbo packets writing
> - fixed missing 'err' assignment
> - fixed commit message
> 
>  .travis.yml   |   4 +-
>  block/Makefile.objs   |   6 +-
>  block/ssh.c   | 669 ++
>  block/trace-events|  14 +-
>  configure |  65 +-
>  docs/qemu-block-drivers.texi  |   2 +-
>  .../dockerfiles/debian-win32-cross.docker |   1 -
>  .../dockerfiles/debian-win64-cross.docker |   1 -
>  tests/docker/dockerfiles/fedora.docker|   4 +-
>  tests/docker/dockerfiles/ubuntu.docker|   2 +-
>  tests/docker/dockerfiles/ubuntu1804.docker|   2 +-
>  tests/qemu-iotests/207|  54 +-
>  tests/qemu-iotests/207.out|   2 +-
>  13 files changed, 468 insertions(+), 358 deletions(-)
> 
> diff --git a/.travis.yml b/.travis.yml
> index aeb9b211cd..279658b116 100644
> --- a/.travis.yml
> +++ b/.travis.yml
> @@ -31,7 +31,7 @@ addons:
>- libseccomp-dev
>- libspice-protocol-dev
>- libspice-server-dev
> -  - libssh2-1-dev
> +  - libssh-dev
>- liburcu-dev
>- libusb-1.0-0-dev
>- libvte-2.91-dev
> @@ -270,7 +270,7 @@ matrix:
>  - libseccomp-dev
>  - libspice-protocol-dev
>  - libspice-server-dev
> -- libssh2-1-dev
> +- libssh-dev
>  - liburcu-dev
>  - libusb-1.0-0-dev
>  - libvte-2.91-dev
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index dbd1522722..35f3bca4d9 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -31,7 +31,7 @@ block-obj-$(CONFIG_CURL) += curl.o
>  block-obj-$(CONFIG_RBD) += rbd.o
>  block-obj-$(CONFIG_GLUSTERFS) += gluster.o
>  block-obj-$(CONFIG_VXHS) += vxhs.o
> -block-obj-$(CONFIG_LIBSSH2) += ssh.o
> +block-obj-$(CONFIG_LIBSSH) += ssh.o
>  block-obj-y += accounting.o dirty-bitmap.o
>  block-obj-y += write-threshold.o
>  block-obj-y += backup.o
> @@ -52,8 +52,8 @@ rbd.o-libs := $(RBD_LIBS)
>  gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
>  gluster.o-libs := $(GLUSTERFS_LIBS)
>  vxhs.o-libs:= $(VXHS_LIBS)
> -ssh.o-cflags   := 

Re: [Qemu-devel] [PATCH v11] ssh: switch from libssh2 to libssh

2019-06-20 Thread Max Reitz
On 20.06.19 22:08, Pino Toscano wrote:
> Rewrite the implementation of the ssh block driver to use libssh instead
> of libssh2.  The libssh library has various advantages over libssh2:
> - easier API for authentication (for example for using ssh-agent)
> - easier API for known_hosts handling
> - supports newer types of keys in known_hosts
> 
> Use APIs/features available in libssh 0.8 conditionally, to support
> older versions (which are not recommended though).
> 
> Adjust the iotest 207 according to the different error message, and to
> find the default key type for localhost (to properly compare the
> fingerprint with).
> Contributed-by: Max Reitz 
> 
> Adjust the various Docker/Travis scripts to use libssh when available
> instead of libssh2. The mingw/mxe testing is dropped for now, as there
> are no packages for it.
> 
> Signed-off-by: Pino Toscano 
> Tested-by: Philippe Mathieu-Daudé 
> Acked-by: Alex Bennée 
> ---
> 
> Changes from v10:
> - improve error message for key mismatch
> - integrate Max Reitz' fix to iotest 207 to detect the key type used by
>   localhost
> 
> Changes from v9:
> - restored "default" case in the server status switch for libssh < 0.8.0
> - print the host key type & fingerprint on mismatch with known_hosts
> - improve/fix message for failed socket_set_nodelay()
> - reset s->sock properly
> 
> Changes from v8:
> - use a newer key type in iotest 207
> - improve the commit message
> 
> Changes from v7:
> - #if HAVE_LIBSSH_0_8 -> #ifdef HAVE_LIBSSH_0_8
> - ptrdiff_t -> size_t
> 
> Changes from v6:
> - fixed few checkpatch style issues
> - detect libssh 0.8 via symbol detection
> - adjust travis/docker test material
> - remove dead "default" case in a switch
> - use variables for storing MIN() results
> - adapt a documentation bit
> 
> Changes from v5:
> - adapt to newer tracing APIs
> - disable ssh compression (mimic what libssh2 does by default)
> - use build time checks for libssh 0.8, and use newer APIs directly
> 
> Changes from v4:
> - fix wrong usages of error_setg/session_error_setg/sftp_error_setg
> - fix few return code checks
> - remove now-unused parameters in few internal functions
> - allow authentication with "none" method
> - switch to unsigned int for the port number
> - enable TCP_NODELAY on the socket
> - fix one reference error message in iotest 207
> 
> Changes from v3:
> - fix socket cleanup in connect_to_ssh()
> - add comments about the socket cleanup
> - improve the error reporting (closer to what was with libssh2)
> - improve EOF detection on sftp_read()
> 
> Changes from v2:
> - used again an own fd
> - fixed co_yield() implementation
> 
> Changes from v1:
> - fixed jumbo packets writing
> - fixed missing 'err' assignment
> - fixed commit message
> 
>  .travis.yml   |   4 +-
>  block/Makefile.objs   |   6 +-
>  block/ssh.c   | 669 ++
>  block/trace-events|  14 +-
>  configure |  65 +-
>  docs/qemu-block-drivers.texi  |   2 +-
>  .../dockerfiles/debian-win32-cross.docker |   1 -
>  .../dockerfiles/debian-win64-cross.docker |   1 -
>  tests/docker/dockerfiles/fedora.docker|   4 +-
>  tests/docker/dockerfiles/ubuntu.docker|   2 +-
>  tests/docker/dockerfiles/ubuntu1804.docker|   2 +-
>  tests/qemu-iotests/207|  54 +-
>  tests/qemu-iotests/207.out|   2 +-
>  13 files changed, 468 insertions(+), 358 deletions(-)

Thanks, applied to my block branch:

https://git.xanclic.moe/XanClic/qemu/commits/branch/block

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 11/12] iotests: add test 257 for bitmap-mode backups

2019-06-20 Thread John Snow



On 6/20/19 3:48 PM, Max Reitz wrote:
> On 20.06.19 21:08, John Snow wrote:
>>
>>
>> On 6/20/19 2:35 PM, Max Reitz wrote:
>>> On 20.06.19 03:03, John Snow wrote:
 Signed-off-by: John Snow 
 ---
  tests/qemu-iotests/257 |  412 +++
  tests/qemu-iotests/257.out | 2199 
  tests/qemu-iotests/group   |1 +
  3 files changed, 2612 insertions(+)
  create mode 100755 tests/qemu-iotests/257
  create mode 100644 tests/qemu-iotests/257.out
>>>
>>> This test is actually quite nicely written.
>>>
>>
>> Thanks!
>>
>>> I like that I don’t have to read the reference output but can just grep
>>> for “error”.
>>>
>>
>> Me too!! Actually, doing the math for what to expect and verifying the
>> output by hand was becoming a major burden, so partially this test
>> infrastructure was my attempt to procedurally verify that the results I
>> was seeing were what made sense.
>>
>> At the end of it, I felt it was nice to keep it in there.
>>
>>> Only minor notes below.
>>>
 diff --git a/tests/qemu-iotests/257 b/tests/qemu-iotests/257
 new file mode 100755
 index 00..5f7f388504
 --- /dev/null
 +++ b/tests/qemu-iotests/257
>>>
>>> [...]
>>>
 +class PatternGroup:
 +"""Grouping of Pattern objects. Initialize with an iterable of 
 Patterns."""
 +def __init__(self, patterns):
 +self.patterns = patterns
 +
 +def bits(self, granularity):
 +"""Calculate the unique bits dirtied by this pattern grouping"""
 +res = set()
 +for pattern in self.patterns:
 +lower = math.floor(pattern.offset / granularity)
 +upper = math.floor((pattern.offset + pattern.size - 1) / 
 granularity)
 +res = res | set(range(lower, upper + 1))
>>>
>>> Why you’d do floor((x - 1) / y) + 1 has confused me quite a while.
>>> Until I realized that oh yeah, Python’s range() is a right-open
>>> interval.  I don’t like Python’s range().
>>>
>>
>> It confuses me constantly, but it's really meant to be used for
>> iterating over lengths.
> 
> I can see the use for range(x), but not for range(a, b).
> 
> (At least it’s not Rust, where [a..b] is [a, b), too – it’s enclosed in
> square brackets, it should be closed, damnit.)
> 
>> This is somewhat of an abuse of it. I always
>> test it out in a console first before using it, just in case.
>>
>>> (Yes, you’re right, this is better to read than just ceil(x / y).
>>> Because it reminds people like me that range() is weird.)
>>>
 +return res
 +
 +GROUPS = [
 +PatternGroup([
 +# Batch 0: 4 clusters
 +mkpattern('0x49', 0x000),
 +mkpattern('0x6c', 0x010),   # 1M
 +mkpattern('0x6f', 0x200),   # 32M
 +mkpattern('0x76', 0x3ff)]), # 64M - 64K
 +PatternGroup([
 +# Batch 1: 6 clusters (3 new)
 +mkpattern('0x65', 0x000),   # Full overwrite
 +mkpattern('0x77', 0x00f8000),   # Partial-left (1M-32K)
 +mkpattern('0x72', 0x2008000),   # Partial-right (32M+32K)
 +mkpattern('0x69', 0x3fe)]), # Adjacent-left (64M - 128K)
 +PatternGroup([
 +# Batch 2: 7 clusters (3 new)
 +mkpattern('0x74', 0x001),   # Adjacent-right
 +mkpattern('0x69', 0x00e8000),   # Partial-left  (1M-96K)
 +mkpattern('0x6e', 0x2018000),   # Partial-right (32M+96K)
 +mkpattern('0x67', 0x3fe,
 +  2*GRANULARITY)]), # Overwrite [(64M-128K)-64M)
 +PatternGroup([
 +# Batch 3: 8 clusters (5 new)
 +# Carefully chosen such that nothing re-dirties the one cluster
 +# that copies out successfully before failure in Group #1.
 +mkpattern('0xaa', 0x001,
 +  3*GRANULARITY),   # Overwrite and 2x Adjacent-right
 +mkpattern('0xbb', 0x00d8000),   # Partial-left (1M-160K)
 +mkpattern('0xcc', 0x2028000),   # Partial-right (32M+160K)
 +mkpattern('0xdd', 0x3fc)]), # New; leaving a gap to the right
 +]
>>>
>>> I’d place this four spaces to the left.  But maybe placing it here is
>>> proper Python indentation, while moving it to the left would be C
>>> indentation.
>>>
>>
>> Either is fine, I think. In this case it affords us more room for the
>> commentary on the bit ranges. Maybe it's not necessary, but at least
>> personally I get woozy looking at the bit patterns.
> 
> Oh, no, no, I just meant the final closing ”]” of GROUPS.
> 
> (I did wonder about why you didn’t place every PatternGroups closing ])
> on a separate line, too, but I decided not to say anything, because it
> looks Python-y this way.  But you’re right, this gives a nice excuse why
> to put more space between the patterns and the comments, which helps.)
> 
 +class Drive:
 +"""Represents, 

Re: [Qemu-devel] [PATCH v10] ssh: switch from libssh2 to libssh

2019-06-20 Thread Pino Toscano
On Thursday, 20 June 2019 14:58:40 CEST Max Reitz wrote:
> On 20.06.19 11:49, Pino Toscano wrote:
> > On Tuesday, 18 June 2019 15:14:30 CEST Max Reitz wrote:
> >> On 18.06.19 11:24, Pino Toscano wrote:
> >>> Rewrite the implementation of the ssh block driver to use libssh instead
> >>> of libssh2.  The libssh library has various advantages over libssh2:
> >>> - easier API for authentication (for example for using ssh-agent)
> >>> - easier API for known_hosts handling
> >>> - supports newer types of keys in known_hosts
> >>>
> >>> Use APIs/features available in libssh 0.8 conditionally, to support
> >>> older versions (which are not recommended though).
> >>>
> >>> Adjust the tests according to the different error message, and to the
> >>> newer host keys (ed25519) that are used by default with OpenSSH >= 6.7
> >>> and libssh >= 0.7.0.
> >>>
> >>> Adjust the various Docker/Travis scripts to use libssh when available
> >>> instead of libssh2. The mingw/mxe testing is dropped for now, as there
> >>> are no packages for it.
> >>>
> >>> Signed-off-by: Pino Toscano 
> >>> Tested-by: Philippe Mathieu-Daudé 
> >>> Acked-by: Alex Bennée 
> >>> ---
> >>>
> >>> Changes from v9:
> >>> - restored "default" case in the server status switch for libssh < 0.8.0
> >>> - print the host key type & fingerprint on mismatch with known_hosts
> >>> - improve/fix message for failed socket_set_nodelay()
> >>> - reset s->sock properly
> >>>
> >>> Changes from v8:
> >>> - use a newer key type in iotest 207
> >>> - improve the commit message
> >>>
> >>> Changes from v7:
> >>> - #if HAVE_LIBSSH_0_8 -> #ifdef HAVE_LIBSSH_0_8
> >>> - ptrdiff_t -> size_t
> >>>
> >>> Changes from v6:
> >>> - fixed few checkpatch style issues
> >>> - detect libssh 0.8 via symbol detection
> >>> - adjust travis/docker test material
> >>> - remove dead "default" case in a switch
> >>> - use variables for storing MIN() results
> >>> - adapt a documentation bit
> >>>
> >>> Changes from v5:
> >>> - adapt to newer tracing APIs
> >>> - disable ssh compression (mimic what libssh2 does by default)
> >>> - use build time checks for libssh 0.8, and use newer APIs directly
> >>>
> >>> Changes from v4:
> >>> - fix wrong usages of error_setg/session_error_setg/sftp_error_setg
> >>> - fix few return code checks
> >>> - remove now-unused parameters in few internal functions
> >>> - allow authentication with "none" method
> >>> - switch to unsigned int for the port number
> >>> - enable TCP_NODELAY on the socket
> >>> - fix one reference error message in iotest 207
> >>>
> >>> Changes from v3:
> >>> - fix socket cleanup in connect_to_ssh()
> >>> - add comments about the socket cleanup
> >>> - improve the error reporting (closer to what was with libssh2)
> >>> - improve EOF detection on sftp_read()
> >>>
> >>> Changes from v2:
> >>> - used again an own fd
> >>> - fixed co_yield() implementation
> >>>
> >>> Changes from v1:
> >>> - fixed jumbo packets writing
> >>> - fixed missing 'err' assignment
> >>> - fixed commit message
> >>>
> >>>  .travis.yml   |   4 +-
> >>>  block/Makefile.objs   |   6 +-
> >>>  block/ssh.c   | 665 ++
> >>>  block/trace-events|  14 +-
> >>>  configure |  65 +-
> >>>  docs/qemu-block-drivers.texi  |   2 +-
> >>>  .../dockerfiles/debian-win32-cross.docker |   1 -
> >>>  .../dockerfiles/debian-win64-cross.docker |   1 -
> >>>  tests/docker/dockerfiles/fedora.docker|   4 +-
> >>>  tests/docker/dockerfiles/ubuntu.docker|   2 +-
> >>>  tests/docker/dockerfiles/ubuntu1804.docker|   2 +-
> >>>  tests/qemu-iotests/207|   4 +-
> >>>  tests/qemu-iotests/207.out|   2 +-
> >>>  13 files changed, 423 insertions(+), 349 deletions(-)
> >>
> >> [...]
> >>
> >>> diff --git a/block/ssh.c b/block/ssh.c
> >>> index 6da7b9cbfe..644ae8b82c 100644
> >>> --- a/block/ssh.c
> >>> +++ b/block/ssh.c
> >>
> >> [...]
> >>
> >>> +case SSH_SERVER_KNOWN_CHANGED:
> >>> +ret = -EINVAL;
> >>> +r = ssh_get_publickey(s->session, );
> >>> +if (r == 0) {
> >>> +r = ssh_get_publickey_hash(pubkey, SSH_PUBLICKEY_HASH_SHA1,
> >>> +   _hash, _hash_len);
> >>> +pubkey_type = ssh_key_type(pubkey);
> >>> +ssh_key_free(pubkey);
> >>> +}
> >>> +if (r == 0) {
> >>> +fingerprint = 
> >>> ssh_get_fingerprint_hash(SSH_PUBLICKEY_HASH_SHA1,
> >>> +   server_hash,
> >>> +   server_hash_len);
> >>> +ssh_clean_pubkey_hash(_hash);
> >>> +}
> >>> +if (fingerprint) {
> >>> +error_setg(errp,
> >>> +   "host key (%s key with fingerprint %s) does not 
> >>> match "
> >>> + 

Re: [Qemu-devel] [PATCH v10] ssh: switch from libssh2 to libssh

2019-06-20 Thread Max Reitz
On 20.06.19 22:03, Pino Toscano wrote:
> On Thursday, 20 June 2019 14:58:40 CEST Max Reitz wrote:
>> On 20.06.19 11:49, Pino Toscano wrote:
>>> On Tuesday, 18 June 2019 15:14:30 CEST Max Reitz wrote:
 On 18.06.19 11:24, Pino Toscano wrote:
> Rewrite the implementation of the ssh block driver to use libssh instead
> of libssh2.  The libssh library has various advantages over libssh2:
> - easier API for authentication (for example for using ssh-agent)
> - easier API for known_hosts handling
> - supports newer types of keys in known_hosts
>
> Use APIs/features available in libssh 0.8 conditionally, to support
> older versions (which are not recommended though).
>
> Adjust the tests according to the different error message, and to the
> newer host keys (ed25519) that are used by default with OpenSSH >= 6.7
> and libssh >= 0.7.0.
>
> Adjust the various Docker/Travis scripts to use libssh when available
> instead of libssh2. The mingw/mxe testing is dropped for now, as there
> are no packages for it.
>
> Signed-off-by: Pino Toscano 
> Tested-by: Philippe Mathieu-Daudé 
> Acked-by: Alex Bennée 
> ---
>
> Changes from v9:
> - restored "default" case in the server status switch for libssh < 0.8.0
> - print the host key type & fingerprint on mismatch with known_hosts
> - improve/fix message for failed socket_set_nodelay()
> - reset s->sock properly
>
> Changes from v8:
> - use a newer key type in iotest 207
> - improve the commit message
>
> Changes from v7:
> - #if HAVE_LIBSSH_0_8 -> #ifdef HAVE_LIBSSH_0_8
> - ptrdiff_t -> size_t
>
> Changes from v6:
> - fixed few checkpatch style issues
> - detect libssh 0.8 via symbol detection
> - adjust travis/docker test material
> - remove dead "default" case in a switch
> - use variables for storing MIN() results
> - adapt a documentation bit
>
> Changes from v5:
> - adapt to newer tracing APIs
> - disable ssh compression (mimic what libssh2 does by default)
> - use build time checks for libssh 0.8, and use newer APIs directly
>
> Changes from v4:
> - fix wrong usages of error_setg/session_error_setg/sftp_error_setg
> - fix few return code checks
> - remove now-unused parameters in few internal functions
> - allow authentication with "none" method
> - switch to unsigned int for the port number
> - enable TCP_NODELAY on the socket
> - fix one reference error message in iotest 207
>
> Changes from v3:
> - fix socket cleanup in connect_to_ssh()
> - add comments about the socket cleanup
> - improve the error reporting (closer to what was with libssh2)
> - improve EOF detection on sftp_read()
>
> Changes from v2:
> - used again an own fd
> - fixed co_yield() implementation
>
> Changes from v1:
> - fixed jumbo packets writing
> - fixed missing 'err' assignment
> - fixed commit message
>
>  .travis.yml   |   4 +-
>  block/Makefile.objs   |   6 +-
>  block/ssh.c   | 665 ++
>  block/trace-events|  14 +-
>  configure |  65 +-
>  docs/qemu-block-drivers.texi  |   2 +-
>  .../dockerfiles/debian-win32-cross.docker |   1 -
>  .../dockerfiles/debian-win64-cross.docker |   1 -
>  tests/docker/dockerfiles/fedora.docker|   4 +-
>  tests/docker/dockerfiles/ubuntu.docker|   2 +-
>  tests/docker/dockerfiles/ubuntu1804.docker|   2 +-
>  tests/qemu-iotests/207|   4 +-
>  tests/qemu-iotests/207.out|   2 +-
>  13 files changed, 423 insertions(+), 349 deletions(-)

 [...]

> diff --git a/block/ssh.c b/block/ssh.c
> index 6da7b9cbfe..644ae8b82c 100644
> --- a/block/ssh.c
> +++ b/block/ssh.c

 [...]

> +case SSH_SERVER_KNOWN_CHANGED:
> +ret = -EINVAL;
> +r = ssh_get_publickey(s->session, );
> +if (r == 0) {
> +r = ssh_get_publickey_hash(pubkey, SSH_PUBLICKEY_HASH_SHA1,
> +   _hash, _hash_len);
> +pubkey_type = ssh_key_type(pubkey);
> +ssh_key_free(pubkey);
> +}
> +if (r == 0) {
> +fingerprint = 
> ssh_get_fingerprint_hash(SSH_PUBLICKEY_HASH_SHA1,
> +   server_hash,
> +   server_hash_len);
> +ssh_clean_pubkey_hash(_hash);
> +}
> +if (fingerprint) {
> +error_setg(errp,
> +   "host key (%s key with fingerprint %s) 

[Qemu-devel] [PATCH v11] ssh: switch from libssh2 to libssh

2019-06-20 Thread Pino Toscano
Rewrite the implementation of the ssh block driver to use libssh instead
of libssh2.  The libssh library has various advantages over libssh2:
- easier API for authentication (for example for using ssh-agent)
- easier API for known_hosts handling
- supports newer types of keys in known_hosts

Use APIs/features available in libssh 0.8 conditionally, to support
older versions (which are not recommended though).

Adjust the iotest 207 according to the different error message, and to
find the default key type for localhost (to properly compare the
fingerprint with).
Contributed-by: Max Reitz 

Adjust the various Docker/Travis scripts to use libssh when available
instead of libssh2. The mingw/mxe testing is dropped for now, as there
are no packages for it.

Signed-off-by: Pino Toscano 
Tested-by: Philippe Mathieu-Daudé 
Acked-by: Alex Bennée 
---

Changes from v10:
- improve error message for key mismatch
- integrate Max Reitz' fix to iotest 207 to detect the key type used by
  localhost

Changes from v9:
- restored "default" case in the server status switch for libssh < 0.8.0
- print the host key type & fingerprint on mismatch with known_hosts
- improve/fix message for failed socket_set_nodelay()
- reset s->sock properly

Changes from v8:
- use a newer key type in iotest 207
- improve the commit message

Changes from v7:
- #if HAVE_LIBSSH_0_8 -> #ifdef HAVE_LIBSSH_0_8
- ptrdiff_t -> size_t

Changes from v6:
- fixed few checkpatch style issues
- detect libssh 0.8 via symbol detection
- adjust travis/docker test material
- remove dead "default" case in a switch
- use variables for storing MIN() results
- adapt a documentation bit

Changes from v5:
- adapt to newer tracing APIs
- disable ssh compression (mimic what libssh2 does by default)
- use build time checks for libssh 0.8, and use newer APIs directly

Changes from v4:
- fix wrong usages of error_setg/session_error_setg/sftp_error_setg
- fix few return code checks
- remove now-unused parameters in few internal functions
- allow authentication with "none" method
- switch to unsigned int for the port number
- enable TCP_NODELAY on the socket
- fix one reference error message in iotest 207

Changes from v3:
- fix socket cleanup in connect_to_ssh()
- add comments about the socket cleanup
- improve the error reporting (closer to what was with libssh2)
- improve EOF detection on sftp_read()

Changes from v2:
- used again an own fd
- fixed co_yield() implementation

Changes from v1:
- fixed jumbo packets writing
- fixed missing 'err' assignment
- fixed commit message

 .travis.yml   |   4 +-
 block/Makefile.objs   |   6 +-
 block/ssh.c   | 669 ++
 block/trace-events|  14 +-
 configure |  65 +-
 docs/qemu-block-drivers.texi  |   2 +-
 .../dockerfiles/debian-win32-cross.docker |   1 -
 .../dockerfiles/debian-win64-cross.docker |   1 -
 tests/docker/dockerfiles/fedora.docker|   4 +-
 tests/docker/dockerfiles/ubuntu.docker|   2 +-
 tests/docker/dockerfiles/ubuntu1804.docker|   2 +-
 tests/qemu-iotests/207|  54 +-
 tests/qemu-iotests/207.out|   2 +-
 13 files changed, 468 insertions(+), 358 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index aeb9b211cd..279658b116 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -31,7 +31,7 @@ addons:
   - libseccomp-dev
   - libspice-protocol-dev
   - libspice-server-dev
-  - libssh2-1-dev
+  - libssh-dev
   - liburcu-dev
   - libusb-1.0-0-dev
   - libvte-2.91-dev
@@ -270,7 +270,7 @@ matrix:
 - libseccomp-dev
 - libspice-protocol-dev
 - libspice-server-dev
-- libssh2-1-dev
+- libssh-dev
 - liburcu-dev
 - libusb-1.0-0-dev
 - libvte-2.91-dev
diff --git a/block/Makefile.objs b/block/Makefile.objs
index dbd1522722..35f3bca4d9 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -31,7 +31,7 @@ block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_VXHS) += vxhs.o
-block-obj-$(CONFIG_LIBSSH2) += ssh.o
+block-obj-$(CONFIG_LIBSSH) += ssh.o
 block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o
 block-obj-y += backup.o
@@ -52,8 +52,8 @@ rbd.o-libs := $(RBD_LIBS)
 gluster.o-cflags   := $(GLUSTERFS_CFLAGS)
 gluster.o-libs := $(GLUSTERFS_LIBS)
 vxhs.o-libs:= $(VXHS_LIBS)
-ssh.o-cflags   := $(LIBSSH2_CFLAGS)
-ssh.o-libs := $(LIBSSH2_LIBS)
+ssh.o-cflags   := $(LIBSSH_CFLAGS)
+ssh.o-libs := $(LIBSSH_LIBS)
 block-obj-dmg-bz2-$(CONFIG_BZIP2) += dmg-bz2.o
 block-obj-$(if $(CONFIG_DMG),m,n) += $(block-obj-dmg-bz2-y)
 dmg-bz2.o-libs := $(BZIP2_LIBS)
diff --git a/block/ssh.c b/block/ssh.c
index 6da7b9cbfe..048d0cc924 

[Qemu-devel] [PATCH] ati-vga: Clarify comment (to be squashed in previous patch)

2019-06-20 Thread BALATON Zoltan
Signed-off-by: BALATON Zoltan 
---
 hw/display/ati.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/display/ati.c b/hw/display/ati.c
index 6fbdda288f..932a1eacea 100644
--- a/hw/display/ati.c
+++ b/hw/display/ati.c
@@ -538,9 +538,14 @@ static void ati_mm_write(void *opaque, hwaddr addr,
 case GPIO_MONID ... GPIO_MONID + 3:
 /* FIXME What does Radeon have here? */
 if (s->dev_id == PCI_DEVICE_ID_ATI_RAGE128_PF) {
-/* Rage128p accesses DDC used to get EDID on these pins */
 ati_reg_write_offs(>regs.gpio_monid,
addr - GPIO_MONID, data, size);
+/*
+ * Rage128p accesses DDC used to get EDID via these bits.
+ * Only touch i2c when write overlaps 3rd byte because some
+ * drivers access this reg via multiple partial writes and
+ * without this spurious bits would be sent.
+ */
 if ((s->regs.gpio_monid & BIT(25)) &&
 addr <= GPIO_MONID + 2 && addr + size > GPIO_MONID + 2) {
 s->regs.gpio_monid = ati_i2c(s->bbi2c, s->regs.gpio_monid, 1);
-- 
2.13.7




Re: [Qemu-devel] [PATCH 0/3] add ati vgabios

2019-06-20 Thread BALATON Zoltan

On Thu, 20 Jun 2019, Gerd Hoffmann wrote:

Gerd Hoffmann (3):
 seabios: add config for ati vgabios
 seabios: add ati vgabios binary
 ati-vga: switch to vgabios-ati.bin

hw/display/ati.c|   2 +-
pc-bios/vgabios-ati.bin | Bin 0 -> 38912 bytes
roms/config.vga-ati |   4 
3 files changed, 5 insertions(+), 1 deletion(-)
create mode 100644 pc-bios/vgabios-ati.bin
create mode 100644 roms/config.vga-ati


Tested-by: BALATON Zoltan 



Re: [Qemu-devel] [PATCH 11/12] iotests: add test 257 for bitmap-mode backups

2019-06-20 Thread Max Reitz
On 20.06.19 21:08, John Snow wrote:
> 
> 
> On 6/20/19 2:35 PM, Max Reitz wrote:
>> On 20.06.19 03:03, John Snow wrote:
>>> Signed-off-by: John Snow 
>>> ---
>>>  tests/qemu-iotests/257 |  412 +++
>>>  tests/qemu-iotests/257.out | 2199 
>>>  tests/qemu-iotests/group   |1 +
>>>  3 files changed, 2612 insertions(+)
>>>  create mode 100755 tests/qemu-iotests/257
>>>  create mode 100644 tests/qemu-iotests/257.out
>>
>> This test is actually quite nicely written.
>>
> 
> Thanks!
> 
>> I like that I don’t have to read the reference output but can just grep
>> for “error”.
>>
> 
> Me too!! Actually, doing the math for what to expect and verifying the
> output by hand was becoming a major burden, so partially this test
> infrastructure was my attempt to procedurally verify that the results I
> was seeing were what made sense.
> 
> At the end of it, I felt it was nice to keep it in there.
> 
>> Only minor notes below.
>>
>>> diff --git a/tests/qemu-iotests/257 b/tests/qemu-iotests/257
>>> new file mode 100755
>>> index 00..5f7f388504
>>> --- /dev/null
>>> +++ b/tests/qemu-iotests/257
>>
>> [...]
>>
>>> +class PatternGroup:
>>> +"""Grouping of Pattern objects. Initialize with an iterable of 
>>> Patterns."""
>>> +def __init__(self, patterns):
>>> +self.patterns = patterns
>>> +
>>> +def bits(self, granularity):
>>> +"""Calculate the unique bits dirtied by this pattern grouping"""
>>> +res = set()
>>> +for pattern in self.patterns:
>>> +lower = math.floor(pattern.offset / granularity)
>>> +upper = math.floor((pattern.offset + pattern.size - 1) / 
>>> granularity)
>>> +res = res | set(range(lower, upper + 1))
>>
>> Why you’d do floor((x - 1) / y) + 1 has confused me quite a while.
>> Until I realized that oh yeah, Python’s range() is a right-open
>> interval.  I don’t like Python’s range().
>>
> 
> It confuses me constantly, but it's really meant to be used for
> iterating over lengths.

I can see the use for range(x), but not for range(a, b).

(At least it’s not Rust, where [a..b] is [a, b), too – it’s enclosed in
square brackets, it should be closed, damnit.)

> This is somewhat of an abuse of it. I always
> test it out in a console first before using it, just in case.
> 
>> (Yes, you’re right, this is better to read than just ceil(x / y).
>> Because it reminds people like me that range() is weird.)
>>
>>> +return res
>>> +
>>> +GROUPS = [
>>> +PatternGroup([
>>> +# Batch 0: 4 clusters
>>> +mkpattern('0x49', 0x000),
>>> +mkpattern('0x6c', 0x010),   # 1M
>>> +mkpattern('0x6f', 0x200),   # 32M
>>> +mkpattern('0x76', 0x3ff)]), # 64M - 64K
>>> +PatternGroup([
>>> +# Batch 1: 6 clusters (3 new)
>>> +mkpattern('0x65', 0x000),   # Full overwrite
>>> +mkpattern('0x77', 0x00f8000),   # Partial-left (1M-32K)
>>> +mkpattern('0x72', 0x2008000),   # Partial-right (32M+32K)
>>> +mkpattern('0x69', 0x3fe)]), # Adjacent-left (64M - 128K)
>>> +PatternGroup([
>>> +# Batch 2: 7 clusters (3 new)
>>> +mkpattern('0x74', 0x001),   # Adjacent-right
>>> +mkpattern('0x69', 0x00e8000),   # Partial-left  (1M-96K)
>>> +mkpattern('0x6e', 0x2018000),   # Partial-right (32M+96K)
>>> +mkpattern('0x67', 0x3fe,
>>> +  2*GRANULARITY)]), # Overwrite [(64M-128K)-64M)
>>> +PatternGroup([
>>> +# Batch 3: 8 clusters (5 new)
>>> +# Carefully chosen such that nothing re-dirties the one cluster
>>> +# that copies out successfully before failure in Group #1.
>>> +mkpattern('0xaa', 0x001,
>>> +  3*GRANULARITY),   # Overwrite and 2x Adjacent-right
>>> +mkpattern('0xbb', 0x00d8000),   # Partial-left (1M-160K)
>>> +mkpattern('0xcc', 0x2028000),   # Partial-right (32M+160K)
>>> +mkpattern('0xdd', 0x3fc)]), # New; leaving a gap to the right
>>> +]
>>
>> I’d place this four spaces to the left.  But maybe placing it here is
>> proper Python indentation, while moving it to the left would be C
>> indentation.
>>
> 
> Either is fine, I think. In this case it affords us more room for the
> commentary on the bit ranges. Maybe it's not necessary, but at least
> personally I get woozy looking at the bit patterns.

Oh, no, no, I just meant the final closing ”]” of GROUPS.

(I did wonder about why you didn’t place every PatternGroups closing ])
on a separate line, too, but I decided not to say anything, because it
looks Python-y this way.  But you’re right, this gives a nice excuse why
to put more space between the patterns and the comments, which helps.)

>>> +class Drive:
>>> +"""Represents, vaguely, a drive attached to a VM.
>>> +Includes format, graph, and device information."""
>>> +
>>> +def __init__(self, path, vm=None):
>>> +self.path = 

Re: [Qemu-devel] [RFC PATCH v1 08/12] target.json: add migrate-set-sev-info command

2019-06-20 Thread Singh, Brijesh


On 6/20/19 2:13 PM, Eric Blake wrote:
> On 6/20/19 1:03 PM, Singh, Brijesh wrote:
>> The command can be used by the hypervisor to specify the target Platform
>> Diffie-Hellman key (PDH) and certificate chain before starting the SEV
>> guest migration. The values passed through the command will be used while
>> creating the outgoing encryption context.
>>
>> Signed-off-by: Brijesh Singh 
>> ---
>>   qapi/target.json   | 18 ++
>>   target/i386/monitor.c  | 10 ++
>>   target/i386/sev-stub.c |  5 +
>>   target/i386/sev.c  | 11 +++
>>   target/i386/sev_i386.h |  9 -
>>   5 files changed, 52 insertions(+), 1 deletion(-)
>>
> 
>> +++ b/qapi/target.json
>> @@ -512,3 +512,21 @@
>>   ##
>>   { 'command': 'query-cpu-definitions', 'returns': ['CpuDefinitionInfo'],
>> 'if': 'defined(TARGET_PPC) || defined(TARGET_ARM) || 
>> defined(TARGET_I386) || defined(TARGET_S390X) || defined(TARGET_MIPS)' }
>> +
>> +##
>> +# @migrate-set-sev-info:
>> +#
>> +# The command is used to provide the target host information used during the
>> +# SEV guest.
>> +#
>> +# @pdh the target host platform diffie-hellman key encoded in base64
>> +#
>> +# @plat-cert the target host platform certificate chain encoded in base64
>> +#
>> +# @amd-cert AMD certificate chain which include ASK and OCA encoded in 
>> base64
>> +#
>> +# Since 4.3
> 
> The next release is 4.1, then likely 4.2 near the end of the calendar
> year, then 5.0 in 2020. There is no planned 4.3 release.  Are you trying
> to get this in 4.1?


Ah, I was meaning to type 4.2 and not 4.3. The series has a dependency on
kernel patches; my best effort is to get it ready for the 4.2 merge
window.
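
For reference, an invocation of the command described above would look roughly
like this on the QMP wire. The argument names come from the QAPI schema quoted
earlier; the base64 blobs are placeholders:

    import json

    args = {
        "pdh": "<base64 PDH of the target host>",
        "plat-cert": "<base64 platform certificate chain>",
        "amd-cert": "<base64 AMD certificate chain (ASK and OCA)>",
    }
    print(json.dumps({"execute": "migrate-set-sev-info",
                      "arguments": args}, indent=2))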


Re: [Qemu-devel] [PATCH v4 08/13] vfio: Add save state functions to SaveVMHandlers

2019-06-20 Thread Alex Williamson
On Thu, 20 Jun 2019 20:07:36 +0530
Kirti Wankhede  wrote:

> Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy
> functions. These functions handles pre-copy and stop-and-copy phase.
> 
> In _SAVING|_RUNNING device state or pre-copy phase:
> - read pending_bytes
> - read data_offset - indicates kernel driver to write data to staging
>   buffer which is mmapped.

Why is data_offset the trigger rather than data_size?  It seems that
data_offset can't really change dynamically since it might be mmap'd,
so it seems unnatural to bother re-reading it.

> - read data_size - amount of data in bytes written by vendor driver in 
> migration
>   region.
> - if data section is trapped, pread() number of bytes in data_size, from
>   data_offset.
> - if data section is mmaped, read mmaped buffer of size data_size.
> - Write data packet to file stream as below:
> {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data,
> VFIO_MIG_FLAG_END_OF_STATE }
> 
> In _SAVING device state or stop-and-copy phase
> a. read config space of device and save to migration file stream. This
>doesn't need to be from vendor driver. Any other special config state
>from driver can be saved as data in following iteration.
> b. read pending_bytes - indicates kernel driver to write data to staging
>buffer which is mmapped.

Is it pending_bytes or data_offset that triggers the write out of
data?  Why pending_bytes vs data_size?  I was interpreting
pending_bytes as the total data size while data_size is the size
available to read now, so assumed data_size would be more closely
aligned to making the data available.

> c. read data_size - amount of data in bytes written by vendor driver in
>migration region.
> d. if data section is trapped, pread() from data_offset of size data_size.
> e. if data section is mmaped, read mmaped buffer of size data_size.

Should this read as "pread() from data_offset of data_size, or
optionally if mmap is supported on the data area, read data_size from
start of mapped buffer"?  IOW, pread should always work.  Same in
previous section.

> f. Write data packet as below:
>{VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data}
> g. iterate through steps b to f until (pending_bytes > 0)

s/until/while/

> h. Write {VFIO_MIG_FLAG_END_OF_STATE}
> 
> .save_live_iterate runs outside the iothread lock in the migration case, which
> could race with asynchronous call to get dirty page list causing data 
> corruption
> in mapped migration region. Mutex added here to serial migration buffer read
> operation.

Would we be ahead to use different offsets within the region for device
data vs dirty bitmap to avoid this?
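
As a toy model of the sequence described in the commit message (steps b to f
repeated while pending_bytes > 0, then the end marker), here is a Python sketch
of the control flow. It is not the real VFIO interface; pending_bytes is read
as the total still to send and data_size as the size of the chunk currently
staged:

    # Toy model only; string markers stand in for the VFIO_MIG_FLAG_* values.
    DEV_DATA_STATE = "DEV_DATA_STATE"
    END_OF_STATE = "END_OF_STATE"

    class FakeDevice:
        """Stands in for the vendor driver's migration region."""
        def __init__(self, chunks):
            self.chunks = list(chunks)

        def pending_bytes(self):      # step b: total data still to send
            return sum(len(c) for c in self.chunks)

        def next_chunk(self):         # steps c-e: data_size bytes at data_offset
            return self.chunks.pop(0)

    def save_device_data(dev):
        stream = []
        while dev.pending_bytes() > 0:         # repeat b-f while pending_bytes > 0
            data = dev.next_chunk()
            stream.append((DEV_DATA_STATE, len(data), data))   # step f
        stream.append((END_OF_STATE,))                         # step h
        return stream

    print(save_device_data(FakeDevice([b"chunk-1", b"chunk-2", b"chunk-3"])))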
 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 
> ---
>  hw/vfio/migration.c | 212 
> 
>  1 file changed, 212 insertions(+)
> 
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index fe0887c27664..0a2f30872316 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -107,6 +107,111 @@ static int vfio_migration_set_state(VFIODevice 
> *vbasedev, uint32_t state)
>  return 0;
>  }
>  
> +static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev)
> +{
> +VFIOMigration *migration = vbasedev->migration;
> +VFIORegion *region = >region.buffer;
> +uint64_t data_offset = 0, data_size = 0;
> +int ret;
> +
> +ret = pread(vbasedev->fd, _offset, sizeof(data_offset),
> +region->fd_offset + offsetof(struct 
> vfio_device_migration_info,
> + data_offset));
> +if (ret != sizeof(data_offset)) {
> +error_report("Failed to get migration buffer data offset %d",
> + ret);
> +return -EINVAL;
> +}
> +
> +ret = pread(vbasedev->fd, _size, sizeof(data_size),
> +region->fd_offset + offsetof(struct 
> vfio_device_migration_info,
> + data_size));
> +if (ret != sizeof(data_size)) {
> +error_report("Failed to get migration buffer data size %d",
> + ret);
> +return -EINVAL;
> +}
> +
> +if (data_size > 0) {
> +void *buf = NULL;
> +bool buffer_mmaped = false;
> +
> +if (region->mmaps) {
> +int i;
> +
> +for (i = 0; i < region->nr_mmaps; i++) {
> +if ((data_offset >= region->mmaps[i].offset) &&
> +(data_offset < region->mmaps[i].offset +
> +   region->mmaps[i].size)) {
> +buf = region->mmaps[i].mmap + (data_offset -
> +   region->mmaps[i].offset);

So you're expecting that data_offset is somewhere within the data
area.  Why doesn't the data always simply start at the beginning of the
data area?  ie. data_offset would coincide with the beginning of the
mmap'able area (if supported) and be static.  Does this enable 

Re: [Qemu-devel] [RFC PATCH v1 08/12] target.json: add migrate-set-sev-info command

2019-06-20 Thread Eric Blake
On 6/20/19 1:03 PM, Singh, Brijesh wrote:
> The command can be used by the hypervisor to specify the target Platform
> Diffie-Hellman key (PDH) and certificate chain before starting the SEV
> guest migration. The values passed through the command will be used while
> creating the outgoing encryption context.
> 
> Signed-off-by: Brijesh Singh 
> ---
>  qapi/target.json   | 18 ++
>  target/i386/monitor.c  | 10 ++
>  target/i386/sev-stub.c |  5 +
>  target/i386/sev.c  | 11 +++
>  target/i386/sev_i386.h |  9 -
>  5 files changed, 52 insertions(+), 1 deletion(-)
> 

> +++ b/qapi/target.json
> @@ -512,3 +512,21 @@
>  ##
>  { 'command': 'query-cpu-definitions', 'returns': ['CpuDefinitionInfo'],
>'if': 'defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_I386) 
> || defined(TARGET_S390X) || defined(TARGET_MIPS)' }
> +
> +##
> +# @migrate-set-sev-info:
> +#
> +# The command is used to provide the target host information used during the
> +# SEV guest.
> +#
> +# @pdh the target host platform diffie-hellman key encoded in base64
> +#
> +# @plat-cert the target host platform certificate chain encoded in base64
> +#
> +# @amd-cert AMD certificate chain which include ASK and OCA encoded in base64
> +#
> +# Since 4.3

The next release is 4.1, then likely 4.2 near the end of the calendar
year, then 5.0 in 2020. There is no planned 4.3 release.  Are you trying
to get this in 4.1?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org





Re: [Qemu-devel] [PATCH 07/12] block/backup: add 'always' bitmap sync policy

2019-06-20 Thread Max Reitz
On 20.06.19 20:44, John Snow wrote:
> 
> 
> On 6/20/19 1:00 PM, Max Reitz wrote:
>> On 20.06.19 03:03, John Snow wrote:
>>> This adds an "always" policy for bitmap synchronization. Regardless of if
>>> the job succeeds or fails, the bitmap is *always* synchronized. This means
>>> that for backups that fail part-way through, the bitmap retains a record of
>>> which sectors need to be copied out to accomplish a new backup using the
>>> old, partial result.
>>>
>>> In effect, this allows us to "resume" a failed backup; however the new 
>>> backup
>>> will be from the new point in time, so it isn't a "resume" as much as it is
>>> an "incremental retry." This can be useful in the case of extremely large
>>> backups that fail considerably through the operation and we'd like to not 
>>> waste
>>> the work that was already performed.
>>>
>>> Signed-off-by: John Snow 
>>> ---
>>>  qapi/block-core.json |  5 -
>>>  block/backup.c   | 10 ++
>>>  2 files changed, 10 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>> index 0332dcaabc..58d267f1f5 100644
>>> --- a/qapi/block-core.json
>>> +++ b/qapi/block-core.json
>>> @@ -1143,6 +1143,9 @@
>>>  # An enumeration of possible behaviors for the synchronization of a bitmap
>>>  # when used for data copy operations.
>>>  #
>>> +# @always: The bitmap is always synchronized with remaining blocks to copy,
>>> +#  whether or not the operation has completed successfully or not.
>>> +#
>>>  # @conditional: The bitmap is only synchronized when the operation is 
>>> successul.
>>>  #   This is useful for Incremental semantics.
>>>  #
>>> @@ -1153,7 +1156,7 @@
>>>  # Since: 4.1
>>>  ##
>>>  { 'enum': 'BitmapSyncMode',
>>> -  'data': ['conditional', 'never'] }
>>> +  'data': ['always', 'conditional', 'never'] }
>>>  
>>>  ##
>>>  # @MirrorCopyMode:
>>> diff --git a/block/backup.c b/block/backup.c
>>> index 627f724b68..beb2078696 100644
>>> --- a/block/backup.c
>>> +++ b/block/backup.c
>>> @@ -266,15 +266,17 @@ static void backup_cleanup_sync_bitmap(BackupBlockJob 
>>> *job, int ret)
>>>  BlockDriverState *bs = blk_bs(job->common.blk);
>>>  
>>>  if (ret < 0 || job->bitmap_mode == BITMAP_SYNC_MODE_NEVER) {
>>> -/* Failure, or we don't want to synchronize the bitmap.
>>> - * Merge the successor back into the parent, delete nothing. */
>>> +/* Failure, or we don't want to synchronize the bitmap. */
>>> +if (job->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS) {
>>> +bdrv_dirty_bitmap_claim(job->sync_bitmap, >copy_bitmap);
>>
>> Hmm...  OK, bitmaps in backup always confuse me, so bear with me, please.
>>
> 
> I realize this is an extremely dense section that actually covers a
> *lot* of pathways.
> 
>> (Hi, I’m a time traveler from the end of this section and I can tell you
>> that everything is fine.  I was just confused.  I’ll still keep this
>> here, because it was so much work.)
>>
>> The copy_bitmap is copied from the sync_bitmap at the beginning, so the
>> sync_bitmap can continue to be dirtied, but that won’t affect the job.
>> In normal incremental mode, this means that the sync point is always at
>> the beginning of the job.  (Well, naturally, because that’s how backup
>> is supposed to go.)
>>
> 
> sync_bitmap: This is used as an initial manifest for which sectors to
> copy out. It is the user-provided bitmap. We actually *never* edit this
> bitmap in the body of the job.
> 
> copy_bitmap: This is the manifest for which blocks remain to be copied
> out. We clear bits in this as we go, because we use it as our loop
> condition.
> 
> So what you say is actually only half-true: the sync_bitmap actually
> remains static during the duration of the job, and it has an anonymous
> child that accrues new writes. This is a holdover from before we had a
> copy_bitmap, and we used to use a sync_bitmap directly as our loop
> condition.
> 
> (This could be simplified upstream at present; but after this patch it
> cannot be for reasons explained below. We do wish to maintain three
> distinct sets of bits:
> 1. The bits at the start of the operation,
> 2. The bits accrued during the operation, and
> 3. The bits that remain to be, or were not, copied during the operation.)
> 
> So there's actually three bitmaps:
> 
> - sync_bitmap: actually just static and read-only
> - sync_bitmap's anonymous child: accrues new writes.

Ah, right...  Thanks for writing that up.

> - copy_bitmap: loop conditional.
> 
>> But then replacing the sync_bitmap with the copy_bitmap here means that
>> all of these dirtyings that happened during the job are lost.  Hmm, but
>> that doesn’t matter, does it?  Because whenever something was dirtied in
>> sync_bitmap, the corresponding area must have been copied to the backup
>> due to the job.
>>
> 
> The new dirty bits were accrued very secretly in the anonymous child.
> The new dirty bits are merged in via the reclaim() function.
> 
> So, what 
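
To put the three-bitmap dance into a toy model, here are plain Python sets of
dirty cluster numbers and what the 'always' policy roughly does on failure as
described above (the uncopied bits plus the writes that arrived during the job
become the bitmap a retry starts from); this is a sketch, not the actual block
layer code:

    sync_bitmap = {1, 4, 7}          # static manifest frozen at job start
    anon_child  = set()              # accrues guest writes made during the job
    copy_bitmap = set(sync_bitmap)   # loop condition: what still has to be copied

    copy_bitmap.discard(1)           # cluster 1 copies out successfully ...
    anon_child.add(9)                # ... the guest then dirties cluster 9 ...
    job_failed = True                # ... and the job fails before 4 and 7 are done

    if job_failed:
        # 'always': a retry needs the unfinished work plus the new writes
        sync_bitmap = copy_bitmap | anon_child

    print(sorted(sync_bitmap))       # [4, 7, 9]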

Re: [Qemu-devel] [PATCH 11/12] iotests: add test 257 for bitmap-mode backups

2019-06-20 Thread John Snow



On 6/20/19 2:35 PM, Max Reitz wrote:
> On 20.06.19 03:03, John Snow wrote:
>> Signed-off-by: John Snow 
>> ---
>>  tests/qemu-iotests/257 |  412 +++
>>  tests/qemu-iotests/257.out | 2199 
>>  tests/qemu-iotests/group   |1 +
>>  3 files changed, 2612 insertions(+)
>>  create mode 100755 tests/qemu-iotests/257
>>  create mode 100644 tests/qemu-iotests/257.out
> 
> This test is actually quite nicely written.
> 

Thanks!

> I like that I don’t have to read the reference output but can just grep
> for “error”.
> 

Me too!! Actually, doing the math for what to expect and verifying the
output by hand was becoming a major burden, so partially this test
infrastructure was my attempt to procedurally verify that the results I
was seeing were what made sense.

At the end of it, I felt it was nice to keep it in there.

> Only minor notes below.
> 
>> diff --git a/tests/qemu-iotests/257 b/tests/qemu-iotests/257
>> new file mode 100755
>> index 00..5f7f388504
>> --- /dev/null
>> +++ b/tests/qemu-iotests/257
> 
> [...]
> 
>> +class PatternGroup:
>> +"""Grouping of Pattern objects. Initialize with an iterable of 
>> Patterns."""
>> +def __init__(self, patterns):
>> +self.patterns = patterns
>> +
>> +def bits(self, granularity):
>> +"""Calculate the unique bits dirtied by this pattern grouping"""
>> +res = set()
>> +for pattern in self.patterns:
>> +lower = math.floor(pattern.offset / granularity)
>> +upper = math.floor((pattern.offset + pattern.size - 1) / 
>> granularity)
>> +res = res | set(range(lower, upper + 1))
> 
> Why you’d do floor((x - 1) / y) + 1 has confused me quite a while.
> Until I realized that oh yeah, Python’s range() is a right-open
> interval.  I don’t like Python’s range().
> 

It confuses me constantly, but it's really meant to be used for
iterating over lengths. This is somewhat of an abuse of it. I always
test it out in a console first before using it, just in case.
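
For anyone else squinting at the arithmetic, here it is spelled out as a
standalone sketch of the bits() logic quoted above. GRANULARITY is assumed to
be 64 KiB (the "2*GRANULARITY covers 128K" comments imply that), and the write
size is passed explicitly rather than taken from mkpattern's default:

    import math

    GRANULARITY = 64 * 1024   # assumed 64 KiB cluster size

    def dirtied_clusters(offset, size, granularity=GRANULARITY):
        lower = math.floor(offset / granularity)
        upper = math.floor((offset + size - 1) / granularity)
        # range() is right-open, hence the "+ 1" to include the last cluster
        return set(range(lower, upper + 1))

    # A 64K write at 1M-32K straddles clusters 15 and 16 (the "partial" cases)
    print(sorted(dirtied_clusters(0x00f8000, GRANULARITY)))   # [15, 16]
    # while the same size aligned at 1M stays within cluster 16.
    print(sorted(dirtied_clusters(0x0100000, GRANULARITY)))   # [16]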

> (Yes, you’re right, this is better to read than just ceil(x / y).
> Because it reminds people like me that range() is weird.)
> 
>> +return res
>> +
>> +GROUPS = [
>> +PatternGroup([
>> +# Batch 0: 4 clusters
>> +mkpattern('0x49', 0x000),
>> +mkpattern('0x6c', 0x010),   # 1M
>> +mkpattern('0x6f', 0x200),   # 32M
>> +mkpattern('0x76', 0x3ff)]), # 64M - 64K
>> +PatternGroup([
>> +# Batch 1: 6 clusters (3 new)
>> +mkpattern('0x65', 0x000),   # Full overwrite
>> +mkpattern('0x77', 0x00f8000),   # Partial-left (1M-32K)
>> +mkpattern('0x72', 0x2008000),   # Partial-right (32M+32K)
>> +mkpattern('0x69', 0x3fe)]), # Adjacent-left (64M - 128K)
>> +PatternGroup([
>> +# Batch 2: 7 clusters (3 new)
>> +mkpattern('0x74', 0x001),   # Adjacent-right
>> +mkpattern('0x69', 0x00e8000),   # Partial-left  (1M-96K)
>> +mkpattern('0x6e', 0x2018000),   # Partial-right (32M+96K)
>> +mkpattern('0x67', 0x3fe,
>> +  2*GRANULARITY)]), # Overwrite [(64M-128K)-64M)
>> +PatternGroup([
>> +# Batch 3: 8 clusters (5 new)
>> +# Carefully chosen such that nothing re-dirties the one cluster
>> +# that copies out successfully before failure in Group #1.
>> +mkpattern('0xaa', 0x001,
>> +  3*GRANULARITY),   # Overwrite and 2x Adjacent-right
>> +mkpattern('0xbb', 0x00d8000),   # Partial-left (1M-160K)
>> +mkpattern('0xcc', 0x2028000),   # Partial-right (32M+160K)
>> +mkpattern('0xdd', 0x3fc)]), # New; leaving a gap to the right
>> +]
> 
> I’d place this four spaces to the left.  But maybe placing it here is
> proper Python indentation, while moving it to the left would be C
> indentation.
> 

Either is fine, I think. In this case it affords us more room for the
commentary on the bit ranges. Maybe it's not necessary, but at least
personally I get woozy looking at the bit patterns.

>> +class Drive:
>> +"""Represents, vaguely, a drive attached to a VM.
>> +Includes format, graph, and device information."""
>> +
>> +def __init__(self, path, vm=None):
>> +self.path = path
>> +self.vm = vm
>> +self.fmt = None
>> +self.size = None
>> +self.node = None
>> +self.device = None
>> +
>> +@property
>> +def name(self):
>> +return self.node or self.device
>> +
>> +def img_create(self, fmt, size):
>> +self.fmt = fmt
>> +self.size = size
>> +iotests.qemu_img_create('-f', self.fmt, self.path, str(self.size))
>> +
>> +def create_target(self, name, fmt, size):
>> +basename = os.path.basename(self.path)
>> +file_node_name = "file_{}".format(basename)
>> +vm = self.vm
>> +
>> +log(vm.command('blockdev-create', job_id='bdc-file-job',
>> + 

[Qemu-devel] [PATCH v5 4/5] net/announce: Add HMP optional ID

2019-06-20 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Add the optional ID to the HMP command.

e.g.
   # start an announce for a long time on eth1
   migrate_set_parameter announce-rounds 1000
   announce_self "eth1" e1

   # start an announce on eth2
   announce_self "eth2" e2

   # Change e1 to be announcing on eth1 and eth3
   announce_self "eth1,eth3" e1

   # Cancel e1
   migrate_set_parameter announce-rounds 0
   announce_self "" e1

Signed-off-by: Dr. David Alan Gilbert 
---
 hmp-commands.hx| 7 ---
 monitor/hmp-cmds.c | 3 +++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index c2a2df9708..61d0be29d8 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -955,8 +955,8 @@ ETEXI
 
 {
 .name   = "announce_self",
-.args_type  = "interfaces:s?",
-.params = "[interfaces]",
+.args_type  = "interfaces:s?,id:s?",
+.params = "[interfaces] [id]",
 .help   = "Trigger GARP/RARP announcements",
 .cmd= hmp_announce_self,
 },
@@ -968,7 +968,8 @@ Trigger a round of GARP/RARP broadcasts; this is useful for 
explicitly updating
 network infrastructure after a reconfiguration or some forms of migration.
 The timings of the round are set by the migration announce parameters.
 An optional comma separated @var{interfaces} list restricts the announce to the
-named set of interfaces.
+named set of interfaces. An optional @var{id} can be used to start a separate 
announce
+timer and to change the parameters of it later.
 ETEXI
 
 {
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index a3d34b12fe..96715c7103 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -1669,12 +1669,15 @@ void hmp_info_snapshots(Monitor *mon, const QDict 
*qdict)
 void hmp_announce_self(Monitor *mon, const QDict *qdict)
 {
 const char *interfaces_str = qdict_get_try_str(qdict, "interfaces");
+const char *id = qdict_get_try_str(qdict, "id");
 AnnounceParameters *params = QAPI_CLONE(AnnounceParameters,
 migrate_announce_params());
 
 qapi_free_strList(params->interfaces);
 params->interfaces = strList_from_comma_list(interfaces_str);
 params->has_interfaces = params->interfaces != NULL;
+params->id = g_strdup(id);
+params->has_id = !!params->id;
 qmp_announce_self(params, NULL);
 qapi_free_AnnounceParameters(params);
 }
-- 
2.21.0




Re: [Qemu-devel] [PATCH 08/12] iotests: add testing shim for script-style python tests

2019-06-20 Thread Max Reitz
On 20.06.19 20:47, John Snow wrote:
> 
> 
> On 6/20/19 1:26 PM, Max Reitz wrote:
>> On 20.06.19 19:09, Max Reitz wrote:
>>> On 20.06.19 03:03, John Snow wrote:
 Because the new-style python tests don't use the iotests.main() test
 launcher, we don't turn on the debugger logging for these scripts
 when invoked via ./check -d.

 Refactor the launcher shim into new and old style shims so that they
 share environmental configuration.

 Two cleanup notes: debug was not actually used as a global, and there
 was no reason to create a class in an inner scope just to achieve
 default variables; we can simply create an instance of the runner with
 the values we want instead.

 Signed-off-by: John Snow 
 ---
  tests/qemu-iotests/iotests.py | 40 +++
  1 file changed, 26 insertions(+), 14 deletions(-)
>>>
>>> I don’t quite get how script_main() works (yes, both my Pythonfu and my
>>> Googlefu are that bad), but it works and looks good, so have a
>>
>> Oh, it doesn’t work (well, not automagically).  I just assumed seeing
>> the log output means it’s working.  Seeing that the test needs to call
>> iotests.script_main() explicitly does clear up my confusion.
>>
>> All OK with me.
>>
>> Max
>>
> 
> Yes. I should convert the others to opt-in to the new format so that
> copy-paste in the future will get us the right paradigm.
> 
> Tests just need to be refactored to have a single point of entry so it
> can be passed as a closure to the test runner.
> 
> If this seems like a good change I will do that as a follow-up series
> with only the churn.

It does seem good to me.  Not even because of the test runner, but maybe
even more so because it seems like better style to split the tests into
one function per case.

Max





Re: [Qemu-devel] [PATCH v4 13/13] vfio: Add trace events in migration code path

2019-06-20 Thread Dr. David Alan Gilbert
* Kirti Wankhede (kwankh...@nvidia.com) wrote:
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 

Thanks, adding traces really helps; however, it might be easier
if you just add them in your previous patches where you're
adding the functions.

Dave

> ---
>  hw/vfio/migration.c  | 26 ++
>  hw/vfio/trace-events | 18 ++
>  2 files changed, 44 insertions(+)
> 
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> index 68775b5dec11..70c03f1a969f 100644
> --- a/hw/vfio/migration.c
> +++ b/hw/vfio/migration.c
> @@ -21,6 +21,7 @@
>  #include "exec/ramlist.h"
>  #include "exec/ram_addr.h"
>  #include "pci.h"
> +#include "trace.h"
>  
>  /*
>   * Flags used as delimiter:
> @@ -104,6 +105,7 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, 
> uint32_t state)
>  }
>  
>  vbasedev->device_state = state;
> +trace_vfio_migration_set_state(vbasedev->name, state);
>  return 0;
>  }
>  
> @@ -173,6 +175,8 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice 
> *vbasedev)
>  qemu_put_be64(f, data_size);
>  }
>  
> +trace_vfio_save_buffer(vbasedev->name, data_offset, data_size,
> +   migration->pending_bytes);
>  ret = qemu_file_get_error(f);
>  
>  return data_size;
> @@ -195,6 +199,7 @@ static int vfio_update_pending(VFIODevice *vbasedev)
>  }
>  
>  migration->pending_bytes = pending_bytes;
> +trace_vfio_update_pending(vbasedev->name, pending_bytes);
>  return 0;
>  }
>  
> @@ -209,6 +214,8 @@ static int vfio_save_device_config_state(QEMUFile *f, 
> void *opaque)
>  }
>  qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
>  
> +trace_vfio_save_device_config_state(vbasedev->name);
> +
>  return qemu_file_get_error(f);
>  }
>  
> @@ -225,6 +232,7 @@ static int vfio_load_device_config_state(QEMUFile *f, 
> void *opaque)
>  return -EINVAL;
>  }
>  
> +trace_vfio_load_device_config_state(vbasedev->name);
>  return qemu_file_get_error(f);
>  }
>  
> @@ -343,6 +351,9 @@ void vfio_get_dirty_page_list(VFIODevice *vbasedev,
>  }
>  } while (count < pfn_count);
>  
> +trace_vfio_get_dirty_page_list(vbasedev->name, start_pfn, pfn_count,
> +   page_size);
> +
>  dpl_unlock:
>  qemu_mutex_unlock(>lock);
>  }
> @@ -390,6 +401,7 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
>  return ret;
>  }
>  
> +trace_vfio_save_setup(vbasedev->name);
>  return 0;
>  }
>  
> @@ -401,6 +413,7 @@ static void vfio_save_cleanup(void *opaque)
>  if (migration->region.buffer.mmaps) {
>  vfio_region_unmap(>region.buffer);
>  }
> +trace_vfio_cleanup(vbasedev->name);
>  }
>  
>  static void vfio_save_pending(QEMUFile *f, void *opaque,
> @@ -424,6 +437,7 @@ static void vfio_save_pending(QEMUFile *f, void *opaque,
>  *res_postcopy_only += migration->pending_bytes;
>  }
>  *res_compatible += 0;
> +trace_vfio_save_pending(vbasedev->name);
>  }
>  
>  static int vfio_save_iterate(QEMUFile *f, void *opaque)
> @@ -451,6 +465,7 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
>  return ret;
>  }
>  
> +trace_vfio_save_iterate(vbasedev->name);
>  return ret;
>  }
>  
> @@ -504,6 +519,8 @@ static int vfio_save_complete_precopy(QEMUFile *f, void 
> *opaque)
>  error_report("Failed to set state STOPPED");
>  return ret;
>  }
> +
> +trace_vfio_save_complete_precopy(vbasedev->name);
>  return ret;
>  }
>  
> @@ -544,6 +561,9 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int 
> version_id)
>  
>  data = qemu_get_be64(f);
>  while (data != VFIO_MIG_FLAG_END_OF_STATE) {
> +
> +trace_vfio_load_state(vbasedev->name, data);
> +
>  switch (data) {
>  case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
>  {
> @@ -627,6 +647,8 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int 
> version_id)
>  return -EINVAL;
>  }
>  }
> +trace_vfio_load_state_device_data(vbasedev->name, data_offset,
> +  data_size);
>  break;
>  }
>  }
> @@ -668,6 +690,7 @@ static void vfio_vmstate_change(void *opaque, int 
> running, RunState state)
>  }
>  
>  vbasedev->vm_running = running;
> +trace_vfio_vmstate_change(vbasedev->name, running);
>  }
>  
>  static void vfio_migration_state_notifier(Notifier *notifier, void *data)
> @@ -676,6 +699,8 @@ static void vfio_migration_state_notifier(Notifier 
> *notifier, void *data)
>  VFIODevice *vbasedev = container_of(notifier, VFIODevice, 
> migration_state);
>  int ret;
>  
> +trace_vfio_migration_state_notifier(vbasedev->name, s->state);
> +
>  switch (s->state) {
>  case MIGRATION_STATUS_ACTIVE:
>  if (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING) {
> @@ -758,6 +783,7 @@ int 

[Qemu-devel] [PATCH v5 1/5] net/announce: Allow optional list of interfaces

2019-06-20 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Allow the caller to restrict the set of interfaces that announces are
sent on.  The default is still to send on all interfaces.

e.g.

  { "execute": "announce-self", "arguments": { "initial": 50, "max": 550, 
"rounds": 5, "step": 50, "interfaces": ["vn2", "vn1"] } }

This doesn't affect the behaviour of migration announcements.

Note: There's still only one timer for the qmp command, so that
performing an 'announce-self' on one list of interfaces followed
by another 'announce-self' on another list will stop the announces
on the existing set.

Signed-off-by: Dr. David Alan Gilbert 
---
 include/net/announce.h |  2 +-
 net/announce.c | 39 ---
 net/trace-events   |  2 +-
 qapi/net.json  | 11 ---
 4 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/include/net/announce.h b/include/net/announce.h
index 04a035f679..773470428b 100644
--- a/include/net/announce.h
+++ b/include/net/announce.h
@@ -22,7 +22,7 @@ struct AnnounceTimer {
 /* Returns: update the timer to the next time point */
 int64_t qemu_announce_timer_step(AnnounceTimer *timer);
 
-/* Delete the underlying timer */
+/* Delete the underlying timer and other data */
 void qemu_announce_timer_del(AnnounceTimer *timer);
 
 /*
diff --git a/net/announce.c b/net/announce.c
index 91e9a6e267..1ce42b571d 100644
--- a/net/announce.c
+++ b/net/announce.c
@@ -38,6 +38,8 @@ void qemu_announce_timer_del(AnnounceTimer *timer)
 timer_free(timer->tm);
 timer->tm = NULL;
 }
+qapi_free_strList(timer->params.interfaces);
+timer->params.interfaces = NULL;
 }
 
 /*
@@ -96,24 +98,47 @@ static int announce_self_create(uint8_t *buf,
 
 static void qemu_announce_self_iter(NICState *nic, void *opaque)
 {
+AnnounceTimer *timer = opaque;
 uint8_t buf[60];
 int len;
+bool skip;
+
+if (timer->params.has_interfaces) {
+strList *entry = timer->params.interfaces;
+/* Skip unless we find our name in the requested list */
+skip = true;
+
+while (entry) {
+if (!strcmp(entry->value, nic->ncs->name)) {
+/* Found us */
+skip = false;
+break;
+}
+entry = entry->next;
+}
+} else {
+skip = false;
+}
+
+trace_qemu_announce_self_iter(nic->ncs->name,
+  qemu_ether_ntoa(>conf->macaddr), skip);
 
-trace_qemu_announce_self_iter(qemu_ether_ntoa(>conf->macaddr));
-len = announce_self_create(buf, nic->conf->macaddr.a);
+if (!skip) {
+len = announce_self_create(buf, nic->conf->macaddr.a);
 
-qemu_send_packet_raw(qemu_get_queue(nic), buf, len);
+qemu_send_packet_raw(qemu_get_queue(nic), buf, len);
 
-/* if the NIC provides it's own announcement support, use it as well */
-if (nic->ncs->info->announce) {
-nic->ncs->info->announce(nic->ncs);
+/* if the NIC provides it's own announcement support, use it as well */
+if (nic->ncs->info->announce) {
+nic->ncs->info->announce(nic->ncs);
+}
 }
 }
 static void qemu_announce_self_once(void *opaque)
 {
 AnnounceTimer *timer = (AnnounceTimer *)opaque;
 
-qemu_foreach_nic(qemu_announce_self_iter, NULL);
+qemu_foreach_nic(qemu_announce_self_iter, timer);
 
 if (--timer->round) {
 qemu_announce_timer_step(timer);
diff --git a/net/trace-events b/net/trace-events
index a7937f3f3a..875ef2a0f3 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -1,7 +1,7 @@
 # See docs/devel/tracing.txt for syntax documentation.
 
 # announce.c
-qemu_announce_self_iter(const char *mac) "%s"
+qemu_announce_self_iter(const char *name, const char *mac, int skip) "%s:%s 
skip: %d"
 
 # vhost-user.c
 vhost_user_event(const char *chr, int event) "chr: %s got event: %d"
diff --git a/qapi/net.json b/qapi/net.json
index 5f7bff1637..6f2cd4f530 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -699,6 +699,9 @@
 #
 # @step: Delay increase (in ms) after each self-announcement attempt
 #
+# @interfaces: An optional list of interface names, which restricts the
+#announcement to the listed interfaces. (Since 4.1)
+#
 # Since: 4.0
 ##
 
@@ -706,7 +709,8 @@
   'data': { 'initial': 'int',
 'max': 'int',
 'rounds': 'int',
-'step': 'int' } }
+'step': 'int',
+'*interfaces': ['str'] } }
 
 ##
 # @announce-self:
@@ -718,9 +722,10 @@
 #
 # Example:
 #
-# -> { "execute": "announce-self"
+# -> { "execute": "announce-self",
 #  "arguments": {
-#  "initial": 50, "max": 550, "rounds": 10, "step": 50 } }
+#  "initial": 50, "max": 550, "rounds": 10, "step": 50,
+#  "interfaces": ["vn2", "vn3"] } }
 # <- { "return": {} }
 #
 # Since: 4.0
-- 
2.21.0




Re: [Qemu-devel] [PATCH 08/12] iotests: add testing shim for script-style python tests

2019-06-20 Thread John Snow



On 6/20/19 1:26 PM, Max Reitz wrote:
> On 20.06.19 19:09, Max Reitz wrote:
>> On 20.06.19 03:03, John Snow wrote:
>>> Because the new-style python tests don't use the iotests.main() test
>>> launcher, we don't turn on the debugger logging for these scripts
>>> when invoked via ./check -d.
>>>
>>> Refactor the launcher shim into new and old style shims so that they
>>> share environmental configuration.
>>>
>>> Two cleanup notes: debug was not actually used as a global, and there
>>> was no reason to create a class in an inner scope just to achieve
>>> default variables; we can simply create an instance of the runner with
>>> the values we want instead.
>>>
>>> Signed-off-by: John Snow 
>>> ---
>>>  tests/qemu-iotests/iotests.py | 40 +++
>>>  1 file changed, 26 insertions(+), 14 deletions(-)
>>
>> I don’t quite get how script_main() works (yes, both my Pythonfu and my
>> Googlefu are that bad), but it works and looks good, so have a
> 
> Oh, it doesn’t work (well, not automagically).  I just assumed seeing
> the log output means it’s working.  Seeing that the test needs to call
> iotests.script_main() explicitly does clear up my confusion.
> 
> All OK with me.
> 
> Max
> 

Yes. I should convert the others to opt-in to the new format so that
copy-paste in the future will get us the right paradigm.

Tests just need to be refactored to have a single point of entry so it
can be passed as a closure to the test runner.

If this seems like a good change I will do that as a follow-up series
with only the churn.
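
Roughly, a converted script-style test would collapse to something like the
sketch below. The iotests.script_main() entry point and whatever arguments it
ends up taking come from this series, so the call here is an assumption rather
than the final API:

    # Hypothetical shape of a converted test; iotests.log() exists today,
    # script_main() is the new shim from this series.
    import iotests

    def main():
        # ... the entire test body lives behind one entry point ...
        iotests.log('test ran')

    if __name__ == '__main__':
        iotests.script_main(main)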

--js



[Qemu-devel] [PATCH v5 5/5] net/announce: Expand test for stopping self announce

2019-06-20 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Expand self-announce test to check we can stop an announce timer.
We set it up to send 300 packets, but after we receive
the first one we tell it to stop.

We error if:
   a) We receive more than 30 of the packets
   b) We're still receiving packets well after the 30 seconds they
  should all have taken to arrive

Signed-off-by: Dr. David Alan Gilbert 
---
 tests/virtio-net-test.c | 57 ++---
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/tests/virtio-net-test.c b/tests/virtio-net-test.c
index 663cf7ea7e..7aa9622f30 100644
--- a/tests/virtio-net-test.c
+++ b/tests/virtio-net-test.c
@@ -184,21 +184,72 @@ static void announce_self(void *obj, void *data, 
QGuestAllocator *t_alloc)
 QDict *rsp;
 int ret;
 uint16_t *proto = (uint16_t *)[12];
+size_t total_received = 0;
+uint64_t start, now, last_rxt, deadline;
 
+/* Send a set of packets over a few second period */
 rsp = qmp("{ 'execute' : 'announce-self', "
   " 'arguments': {"
-  " 'initial': 50, 'max': 550,"
-  " 'rounds': 10, 'step': 50 } }");
+  " 'initial': 20, 'max': 100,"
+  " 'rounds': 300, 'step': 10, 'id': 'bob' } }");
 assert(!qdict_haskey(rsp, "error"));
 qobject_unref(rsp);
 
-/* Catch the packet and make sure it's a RARP */
+/* Catch the first packet and make sure it's a RARP */
 ret = qemu_recv(sv[0], , sizeof(len), 0);
 g_assert_cmpint(ret, ==,  sizeof(len));
 len = ntohl(len);
 
 ret = qemu_recv(sv[0], buffer, len, 0);
 g_assert_cmpint(*proto, ==, htons(ETH_P_RARP));
+
+/*
+ * Stop the announcment by settings rounds to 0 on the
+ * existing timer.
+ */
+rsp = qmp("{ 'execute' : 'announce-self', "
+  " 'arguments': {"
+  " 'initial': 20, 'max': 100,"
+  " 'rounds': 0, 'step': 10, 'id': 'bob' } }");
+assert(!qdict_haskey(rsp, "error"));
+qobject_unref(rsp);
+
+/* Now make sure the packets stop */
+
+/* Times are in us */
+start = g_get_monotonic_time();
+/* 30 packets, max gap 100ms, * 4 for wiggle */
+deadline = start + 1000 * (100 * 30 * 4);
+last_rxt = start;
+
+while (true) {
+int saved_err;
+ret = qemu_recv(sv[0], buffer, 60, MSG_DONTWAIT);
+saved_err = errno;
+now = g_get_monotonic_time();
+g_assert_cmpint(now, <, deadline);
+
+if (ret >= 0) {
+if (ret) {
+last_rxt = now;
+}
+total_received += ret;
+
+/* Check it's not spewing loads */
+g_assert_cmpint(total_received, <, 60 * 30 * 2);
+} else {
+g_assert_cmpint(saved_err, ==, EAGAIN);
+
+/* 400ms, i.e. 4 worst case gaps */
+if ((now - last_rxt) > (1000 * 100 * 4)) {
+/* Nothings arrived for a while - must have stopped */
+break;
+};
+
+/* 100ms */
+g_usleep(1000 * 100);
+}
+};
 }
 
 static void virtio_net_test_cleanup(void *sockets)
-- 
2.21.0




[Qemu-devel] [PATCH v5 2/5] net/announce: Add HMP optional interface list

2019-06-20 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Add the optional interface list to the HMP command.

i.e.

   All interfaces
announce_self

   Just the named interfaces:
announce_self vn1,vn2

Signed-off-by: Dr. David Alan Gilbert 
---
 hmp-commands.hx|  6 --
 monitor/hmp-cmds.c | 38 +-
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 810b7b9283..c2a2df9708 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -955,8 +955,8 @@ ETEXI
 
 {
 .name   = "announce_self",
-.args_type  = "",
-.params = "",
+.args_type  = "interfaces:s?",
+.params = "[interfaces]",
 .help   = "Trigger GARP/RARP announcements",
 .cmd= hmp_announce_self,
 },
@@ -967,6 +967,8 @@ STEXI
 Trigger a round of GARP/RARP broadcasts; this is useful for explicitly 
updating the
 network infrastructure after a reconfiguration or some forms of migration.
 The timings of the round are set by the migration announce parameters.
+An optional comma separated @var{interfaces} list restricts the announce to the
+named set of interfaces.
 ETEXI
 
 {
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index a7ae586723..a3d34b12fe 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -27,6 +27,7 @@
 #include "monitor/monitor-internal.h"
 #include "monitor/qdev.h"
 #include "qapi/error.h"
+#include "qapi/clone-visitor.h"
 #include "qapi/opts-visitor.h"
 #include "qapi/qapi-builtin-visit.h"
 #include "qapi/qapi-commands-block.h"
@@ -38,6 +39,7 @@
 #include "qapi/qapi-commands-run-state.h"
 #include "qapi/qapi-commands-tpm.h"
 #include "qapi/qapi-commands-ui.h"
+#include "qapi/qapi-visit-net.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qerror.h"
 #include "qapi/string-input-visitor.h"
@@ -67,6 +69,32 @@ static void hmp_handle_error(Monitor *mon, Error **errp)
 }
 }
 
+/*
+ * Produce a strList from a comma separated list.
+ * A NULL or empty input string return NULL.
+ */
+static strList *strList_from_comma_list(const char *in)
+{
+strList *res = NULL;
+strList **hook = 
+
+while (in && in[0]) {
+char *comma = strchr(in, ',');
+*hook = g_new0(strList, 1);
+
+if (comma) {
+(*hook)->value = g_strndup(in, comma - in);
+in = comma + 1; /* skip the , */
+} else {
+(*hook)->value = g_strdup(in);
+in = NULL;
+}
+hook = &(*hook)->next;
+}
+
+return res;
+}
+
 void hmp_info_name(Monitor *mon, const QDict *qdict)
 {
 NameInfo *info;
@@ -1640,7 +1668,15 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict)
 
 void hmp_announce_self(Monitor *mon, const QDict *qdict)
 {
-qmp_announce_self(migrate_announce_params(), NULL);
+const char *interfaces_str = qdict_get_try_str(qdict, "interfaces");
+AnnounceParameters *params = QAPI_CLONE(AnnounceParameters,
+migrate_announce_params());
+
+qapi_free_strList(params->interfaces);
+params->interfaces = strList_from_comma_list(interfaces_str);
+params->has_interfaces = params->interfaces != NULL;
+qmp_announce_self(params, NULL);
+qapi_free_AnnounceParameters(params);
 }
 
 void hmp_migrate_cancel(Monitor *mon, const QDict *qdict)
-- 
2.21.0




[Qemu-devel] [PATCH v5 0/5] network announce; interface selection & IDs

2019-06-20 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Up until now, the 'announce' feature has mainly been used
for migration where we announce on all interfaces.  Another
use for 'announce' is in cases of network topology changes.

Since network topology changes may only affect a subset
of the interfaces, we add an 'interface list' to announce
to restrict the announcement to the interfaces we're interested
in.

Multiple topology changes might happen in close succession,
so we allow multiple timers, each with their own parameters
(including the interface list).
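
Concretely, the QMP traffic this enables looks roughly like the sketch below
(parameter names as in qapi/net.json; the 'id' handling is what the later
patches in the series add):

    import json

    def announce(interfaces, timer_id, rounds):
        return {"execute": "announce-self",
                "arguments": {"initial": 50, "max": 550, "rounds": rounds,
                              "step": 50, "interfaces": interfaces,
                              "id": timer_id}}

    # Two topology changes, each with its own timer and interface list ...
    print(json.dumps(announce(["vn1"], "change-a", 5)))
    print(json.dumps(announce(["vn2", "vn3"], "change-b", 5)))
    # ... and the first one cancelled later by setting its rounds to 0.
    print(json.dumps(announce(["vn1"], "change-a", 0)))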

Signed-off-by: Dr. David Alan Gilbert 

v5
  Minor review fixes [Jason] 

Dr. David Alan Gilbert (5):
  net/announce: Allow optional list of interfaces
  net/announce: Add HMP optional interface list
  net/announce: Add optional ID
  net/announce: Add HMP optional ID
  net/announce: Expand test for stopping self announce

 hmp-commands.hx |  7 +++-
 hw/net/virtio-net.c |  4 +-
 include/net/announce.h  |  8 +++-
 monitor/hmp-cmds.c  | 41 ++-
 net/announce.c  | 89 +++--
 net/trace-events|  3 +-
 qapi/net.json   | 16 ++--
 tests/virtio-net-test.c | 57 --
 8 files changed, 198 insertions(+), 27 deletions(-)

-- 
2.21.0
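
As a usage sketch (not part of the series): over QMP, restricting an announce to
two interfaces would look roughly like the following, assuming the optional
'interfaces' member added in patch 1/5 and filling the mandatory timing members
with the usual announce defaults:

{ "execute": "announce-self",
  "arguments": { "initial": 50, "max": 550, "rounds": 5, "step": 100,
                 "interfaces": [ "vn1", "vn2" ] } }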




Re: [Qemu-devel] [PATCH 07/12] block/backup: add 'always' bitmap sync policy

2019-06-20 Thread John Snow



On 6/20/19 1:00 PM, Max Reitz wrote:
> On 20.06.19 03:03, John Snow wrote:
>> This adds an "always" policy for bitmap synchronization. Regardless of if
>> the job succeeds or fails, the bitmap is *always* synchronized. This means
>> that for backups that fail part-way through, the bitmap retains a record of
>> which sectors need to be copied out to accomplish a new backup using the
>> old, partial result.
>>
>> In effect, this allows us to "resume" a failed backup; however the new backup
>> will be from the new point in time, so it isn't a "resume" as much as it is
>> an "incremental retry." This can be useful in the case of extremely large
>> backups that fail considerably through the operation and we'd like to not 
>> waste
>> the work that was already performed.
>>
>> Signed-off-by: John Snow 
>> ---
>>  qapi/block-core.json |  5 -
>>  block/backup.c   | 10 ++
>>  2 files changed, 10 insertions(+), 5 deletions(-)
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 0332dcaabc..58d267f1f5 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -1143,6 +1143,9 @@
>>  # An enumeration of possible behaviors for the synchronization of a bitmap
>>  # when used for data copy operations.
>>  #
>> +# @always: The bitmap is always synchronized with remaining blocks to copy,
>> +#  whether or not the operation has completed successfully or not.
>> +#
>>  # @conditional: The bitmap is only synchronized when the operation is 
>> successul.
>>  #   This is useful for Incremental semantics.
>>  #
>> @@ -1153,7 +1156,7 @@
>>  # Since: 4.1
>>  ##
>>  { 'enum': 'BitmapSyncMode',
>> -  'data': ['conditional', 'never'] }
>> +  'data': ['always', 'conditional', 'never'] }
>>  
>>  ##
>>  # @MirrorCopyMode:
>> diff --git a/block/backup.c b/block/backup.c
>> index 627f724b68..beb2078696 100644
>> --- a/block/backup.c
>> +++ b/block/backup.c
>> @@ -266,15 +266,17 @@ static void backup_cleanup_sync_bitmap(BackupBlockJob 
>> *job, int ret)
>>  BlockDriverState *bs = blk_bs(job->common.blk);
>>  
>>  if (ret < 0 || job->bitmap_mode == BITMAP_SYNC_MODE_NEVER) {
>> -/* Failure, or we don't want to synchronize the bitmap.
>> - * Merge the successor back into the parent, delete nothing. */
>> +/* Failure, or we don't want to synchronize the bitmap. */
>> +if (job->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS) {
>> +bdrv_dirty_bitmap_claim(job->sync_bitmap, &job->copy_bitmap);
> 
> Hmm...  OK, bitmaps in backup always confuse me, so bear with me, please.
> 

I realize this is an extremely dense section that actually covers a
*lot* of pathways.

> (Hi, I’m a time traveler from the end of this section and I can tell you
> that everything is fine.  I was just confused.  I’ll still keep this
> here, because it was so much work.)
> 
> The copy_bitmap is copied from the sync_bitmap at the beginning, so the
> sync_bitmap can continue to be dirtied, but that won’t affect the job.
> In normal incremental mode, this means that the sync point is always at
> the beginning of the job.  (Well, naturally, because that’s how backup
> is supposed to go.)
> 

sync_bitmap: This is used as an initial manifest for which sectors to
copy out. It is the user-provided bitmap. We actually *never* edit this
bitmap in the body of the job.

copy_bitmap: This is the manifest for which blocks remain to be copied
out. We clear bits in this as we go, because we use it as our loop
condition.

So what you say is actually only half-true: the sync_bitmap actually
remains static for the duration of the job, and it has an anonymous
child that accrues new writes. This is a holdover from before we had a
copy_bitmap, and we used to use a sync_bitmap directly as our loop
condition.

(This could be simplified upstream at present; but after this patch it
cannot be for reasons explained below. We do wish to maintain three
distinct sets of bits:
1. The bits at the start of the operation,
2. The bits accrued during the operation, and
3. The bits that remain to be, or were not, copied during the operation.)

So there's actually three bitmaps:

- sync_bitmap: actually just static and read-only
- sync_bitmap's anonymous child: accrues new writes.
- copy_bitmap: loop conditional.

> But then replacing the sync_bitmap with the copy_bitmap here means that
> all of these dirtyings that happened during the job are lost.  Hmm, but
> that doesn’t matter, does it?  Because whenever something was dirtied in
> sync_bitmap, the corresponding area must have been copied to the backup
> due to the job.
> 

The new dirty bits were accrued very secretly in the anonymous child.
The new dirty bits are merged in via the reclaim() function.

So, what happens is:

- Sync_bitmap gets the bit pattern of copy_bitmap (one way or another)
- Sync_bitmap reclaims (merges with) its anonymous child.
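
As a toy model of that bookkeeping (plain integer bitmasks standing in for the
dirty bitmaps; this only illustrates the set arithmetic described above, not the
block-layer API):

    /* Toy model of the MODE_ALWAYS failure path described above. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t copy_bitmap = 0x04;  /* bits the failed job did not copy   */
        uint64_t anon_child  = 0x30;  /* writes that arrived during the job */
        uint64_t sync_bitmap;

        sync_bitmap = copy_bitmap;    /* sync takes the uncopied bits ...   */
        sync_bitmap |= anon_child;    /* ... then reclaims the new writes   */

        printf("next backup copies: %#llx\n", (unsigned long long)sync_bitmap);
        return 0;                     /* prints 0x34 */
    }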

> Ah, yes, it would actually be wrong to keep the new dirty bits, because
> in this mode, 

[Qemu-devel] [RFC PATCH v1 10/12] target/i386: sev: add support to load incoming encrypted page

2019-06-20 Thread Singh, Brijesh
The sev_load_incoming_page() provides the implementation to read the
incoming guest private pages from the socket and load them into the guest
memory. The routine uses the RECEIVE_START command to create the
incoming encryption context on the first call, then uses the
RECEIVE_UPDATE_DATA command to load the encrypted pages into the guest
memory. After migration is completed, we issue the RECEIVE_FINISH command
to transition the SEV guest to the runnable state so that it can be
executed.

Signed-off-by: Brijesh Singh 
---
 accel/kvm/kvm-all.c  |   1 +
 target/i386/sev.c| 126 ++-
 target/i386/trace-events |   3 +
 3 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 85d6508e7f..fe65c8eb5d 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1785,6 +1785,7 @@ static int kvm_init(MachineState *ms)
 kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
 kvm_state->memcrypt_sync_page_enc_bitmap = sev_sync_page_enc_bitmap;
 kvm_state->memcrypt_save_outgoing_page = sev_save_outgoing_page;
+kvm_state->memcrypt_load_incoming_page = sev_load_incoming_page;
 }
 
 ret = kvm_arch_init(ms, s);
diff --git a/target/i386/sev.c b/target/i386/sev.c
index b5aa53ec44..b7feedce7d 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -708,13 +708,34 @@ sev_launch_finish(SEVState *s)
 }
 }
 
+static int
+sev_receive_finish(SEVState *s)
+{
+int error, ret = 1;
+
+trace_kvm_sev_receive_finish();
+ret = sev_ioctl(s->sev_fd, KVM_SEV_RECEIVE_FINISH, 0, &error);
+if (ret) {
+error_report("%s: RECEIVE_FINISH ret=%d fw_error=%d '%s'",
+__func__, ret, error, fw_error_to_str(error));
+goto err;
+}
+
+sev_set_guest_state(SEV_STATE_RUNNING);
+err:
+return ret;
+}
+
+
 static void
 sev_vm_state_change(void *opaque, int running, RunState state)
 {
 SEVState *s = opaque;
 
 if (running) {
-if (!sev_check_state(SEV_STATE_RUNNING)) {
+if (sev_check_state(SEV_STATE_RECEIVE_UPDATE)) {
+sev_receive_finish(s);
+} else if (!sev_check_state(SEV_STATE_RUNNING)) {
 sev_launch_finish(s);
 }
 }
@@ -1092,6 +1113,109 @@ int sev_save_outgoing_page(void *handle, QEMUFile *f, 
uint8_t *ptr,
 return sev_send_update_data(s, f, ptr, sz, bytes_sent);
 }
 
+static int
+sev_receive_start(QSevGuestInfo *sev, QEMUFile *f)
+{
+int ret = 1;
+int fw_error;
+struct kvm_sev_receive_start *start;
+gchar *session = NULL, *pdh_cert = NULL;
+
+start = g_new0(struct kvm_sev_receive_start, 1);
+
+/* get SEV guest handle */
+start->handle = object_property_get_int(OBJECT(sev), "handle",
+&error_abort);
+
+/* get the source policy */
+start->policy = qemu_get_be32(f);
+
+/* get source PDH key */
+start->pdh_len = qemu_get_be32(f);
+pdh_cert = g_new(gchar, start->pdh_len);
+qemu_get_buffer(f, (uint8_t *)pdh_cert, start->pdh_len);
+start->pdh_uaddr = (unsigned long)pdh_cert;
+
+/* get source session data */
+start->session_len = qemu_get_be32(f);
+session = g_new(gchar, start->session_len);
+qemu_get_buffer(f, (uint8_t *)session, start->session_len);
+start->session_uaddr = (unsigned long)session;
+
+trace_kvm_sev_receive_start(start->policy, session, pdh_cert);
+
+ret = sev_ioctl(sev_state->sev_fd, KVM_SEV_RECEIVE_START, start, 
&fw_error);
+if (ret < 0) {
+error_report("Error RECEIVE_START ret=%d fw_error=%d '%s'",
+ret, fw_error, fw_error_to_str(fw_error));
+goto err;
+}
+
+object_property_set_int(OBJECT(sev), start->handle, "handle", 
&error_abort);
+sev_set_guest_state(SEV_STATE_RECEIVE_UPDATE);
+err:
+g_free(start);
+g_free(session);
+g_free(pdh_cert);
+
+return ret;
+}
+
+static int sev_receive_update_data(QEMUFile *f, uint8_t *ptr)
+{
+int ret = 1, fw_error = 0;
+gchar *hdr = NULL, *trans = NULL;
+struct kvm_sev_receive_update_data *update;
+
+update = g_new0(struct kvm_sev_receive_update_data, 1);
+
+/* get packet header */
+update->hdr_len = qemu_get_be32(f);
+hdr = g_new(gchar, update->hdr_len);
+qemu_get_buffer(f, (uint8_t *)hdr, update->hdr_len);
+update->hdr_uaddr = (unsigned long)hdr;
+
+/* get transport buffer */
+update->trans_len = qemu_get_be32(f);
+trans = g_new(gchar, update->trans_len);
+update->trans_uaddr = (unsigned long)trans;
+qemu_get_buffer(f, (uint8_t *)update->trans_uaddr, update->trans_len);
+
+update->guest_uaddr = (unsigned long) ptr;
+update->guest_len = update->trans_len;
+
+trace_kvm_sev_receive_update_data(trans, ptr, update->guest_len,
+hdr, update->hdr_len);
+
+ret = sev_ioctl(sev_state->sev_fd, KVM_SEV_RECEIVE_UPDATE_DATA,
+update, &fw_error);
+if (ret) {
+error_report("Error RECEIVE_UPDATE_DATA ret=%d 

[Qemu-devel] [PATCH v5 3/5] net/announce: Add optional ID

2019-06-20 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Previously there was a single instance of the timer used by
monitor triggered announces, that's OK, but when combined with the
previous change that lets you have announces for subsets of interfaces
it's a bit restrictive if you want to do different things to different
interfaces.

Add an 'id' field to the announce, and maintain a list of the
timers based on id.

This allows you to for example:
a) Start an announce going on interface eth0 for a long time
b) Start an announce going on interface eth1 for a long time
c) Kill the announce on eth0 while leaving eth1 going.
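
A hedged QMP sketch of steps (a) and (b) above; the 'id' member comes from this
patch, 'interfaces' from the earlier patch, and the timing members and their
values are assumptions:

{ "execute": "announce-self",
  "arguments": { "initial": 50, "max": 550, "rounds": 1000, "step": 100,
                 "interfaces": [ "eth0" ], "id": "an-eth0" } }
{ "execute": "announce-self",
  "arguments": { "initial": 50, "max": 550, "rounds": 1000, "step": 100,
                 "interfaces": [ "eth1" ], "id": "an-eth1" } }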

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/net/virtio-net.c|  4 ++--
 include/net/announce.h |  8 +--
 net/announce.c | 52 --
 net/trace-events   |  3 ++-
 qapi/net.json  |  9 ++--
 5 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index c3f5fccfd1..b9e1cd71cf 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -2360,7 +2360,7 @@ static int virtio_net_post_load_device(void *opaque, int 
version_id)
 timer_mod(n->announce_timer.tm,
   qemu_clock_get_ms(n->announce_timer.type));
 } else {
-qemu_announce_timer_del(>announce_timer);
+qemu_announce_timer_del(>announce_timer, false);
 }
 }
 
@@ -2784,7 +2784,7 @@ static void virtio_net_device_unrealize(DeviceState *dev, 
Error **errp)
 virtio_net_del_queue(n, i);
 }
 
-qemu_announce_timer_del(>announce_timer);
+qemu_announce_timer_del(>announce_timer, false);
 g_free(n->vqs);
 qemu_del_nic(n->nic);
 virtio_net_rsc_cleanup(n);
diff --git a/include/net/announce.h b/include/net/announce.h
index 773470428b..3d90c83c23 100644
--- a/include/net/announce.h
+++ b/include/net/announce.h
@@ -22,8 +22,12 @@ struct AnnounceTimer {
 /* Returns: update the timer to the next time point */
 int64_t qemu_announce_timer_step(AnnounceTimer *timer);
 
-/* Delete the underlying timer and other data */
-void qemu_announce_timer_del(AnnounceTimer *timer);
+/*
+ * Delete the underlying timer and other data
+ * If 'free_named' true and the timer is a named timer, then remove
+ * it from the list of named timers and free the AnnounceTimer itself.
+ */
+void qemu_announce_timer_del(AnnounceTimer *timer, bool free_named);
 
 /*
  * Under BQL/main thread
diff --git a/net/announce.c b/net/announce.c
index 1ce42b571d..db90d3bd4b 100644
--- a/net/announce.c
+++ b/net/announce.c
@@ -15,6 +15,8 @@
 #include "qapi/qapi-commands-net.h"
 #include "trace.h"
 
+static GData *named_timers;
+
 int64_t qemu_announce_timer_step(AnnounceTimer *timer)
 {
 int64_t step;
@@ -31,8 +33,13 @@ int64_t qemu_announce_timer_step(AnnounceTimer *timer)
 return step;
 }
 
-void qemu_announce_timer_del(AnnounceTimer *timer)
+/*
+ * If 'free_named' is true, then remove the timer from the list
+ * and free the timer itself.
+ */
+void qemu_announce_timer_del(AnnounceTimer *timer, bool free_named)
 {
+bool free_timer = false;
 if (timer->tm) {
 timer_del(timer->tm);
 timer_free(timer->tm);
@@ -40,6 +47,24 @@ void qemu_announce_timer_del(AnnounceTimer *timer)
 }
 qapi_free_strList(timer->params.interfaces);
 timer->params.interfaces = NULL;
+if (free_named && timer->params.has_id) {
+AnnounceTimer *list_timer;
+/*
+ * Sanity check: There should only be one timer on the list with
+ * the id.
+ */
+list_timer = g_datalist_get_data(&named_timers, timer->params.id);
+assert(timer == list_timer);
+free_timer = true;
+g_datalist_remove_data(&named_timers, timer->params.id);
+}
+trace_qemu_announce_timer_del(free_named, free_timer, timer->params.id);
+g_free(timer->params.id);
+timer->params.id = NULL;
+
+if (free_timer) {
+g_free(timer);
+}
 }
 
 /*
@@ -56,7 +81,7 @@ void qemu_announce_timer_reset(AnnounceTimer *timer,
  * We're under the BQL, so the current timer can't
  * be firing, so we should be able to delete it.
  */
-qemu_announce_timer_del(timer);
+qemu_announce_timer_del(timer, false);
 
 QAPI_CLONE_MEMBERS(AnnounceParameters, &timer->params, params);
 timer->round = params->rounds;
@@ -120,7 +145,8 @@ static void qemu_announce_self_iter(NICState *nic, void 
*opaque)
 skip = false;
 }
 
-trace_qemu_announce_self_iter(nic->ncs->name,
+trace_qemu_announce_self_iter(timer->params.has_id ? timer->params.id : 
"_",
+  nic->ncs->name,
   qemu_ether_ntoa(>conf->macaddr), skip);
 
 if (!skip) {
@@ -143,7 +169,7 @@ static void qemu_announce_self_once(void *opaque)
 if (--timer->round) {
 qemu_announce_timer_step(timer);
 } else {
-qemu_announce_timer_del(timer);
+qemu_announce_timer_del(timer, 

Re: [Qemu-devel] [PATCH 02/12] block/backup: Add mirror sync mode 'bitmap'

2019-06-20 Thread Max Reitz
On 20.06.19 18:01, John Snow wrote:
> 
> 
> On 6/20/19 11:00 AM, Max Reitz wrote:
>> On 20.06.19 03:03, John Snow wrote:
>>> We don't need or want a new sync mode for simple differences in
>>> semantics.  Create a new mode simply named "BITMAP" that is designed to
>>> make use of the new Bitmap Sync Mode field.
>>>
>>> Because the only bitmap mode is 'conditional', this adds no new
>>> functionality to the backup job (yet). The old incremental backup mode
>>> is maintained as a syntactic sugar for sync=bitmap, mode=conditional.
>>>
>>> Add all of the plumbing necessary to support this new instruction.
>>>
>>> Signed-off-by: John Snow 
>>> ---
>>>  qapi/block-core.json  | 30 ++
>>>  include/block/block_int.h |  6 +-
>>>  block/backup.c| 35 ---
>>>  block/mirror.c|  6 --
>>>  block/replication.c   |  2 +-
>>>  blockdev.c|  8 ++--
>>>  6 files changed, 66 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>> index caf28a71a0..6d05ad8f47 100644
>>> --- a/qapi/block-core.json
>>> +++ b/qapi/block-core.json
>>> @@ -1127,12 +1127,15 @@
>>>  #
>>>  # @none: only copy data written from now on
>>>  #
>>> -# @incremental: only copy data described by the dirty bitmap. Since: 2.4
>>> +# @incremental: only copy data described by the dirty bitmap. (since: 2.4)
>>
>> Why not deprecate this in the process and note that this is equal to
>> sync=bitmap, bitmap-mode=conditional?
>>
>> (I don’t think there is a rule that forces us to actually remove
>> deprecated stuff after two releases if it doesn’t hurt to keep it.)
>>
> 
> Mostly I thought it would be fine to keep as sugar. In your replies so
> far I gather that "incremental" and "differential" don't mean specific
> backup paradigms to you, so maybe these seem like worthless words.
> 
> It was my general understanding that in terms of backup
> paradigms/methodologies that "incremental" and "differential" mean very
> specific things.
> 
> Incremental: Each backup contains only the delta from the last
> incremental backup.
> Differential: Each backup contains the delta from the last FULL backup.
> 
> You can search "incremental vs differential backup" on your search
> engine of choice and find many relevant results. I took a Networking/IT
> vocational degree in 2007 and these terms were taught in textbooks then.
> 
> So I will resist quite strongly changing them, and for this reason, felt
> that it was strictly a good thing to keep incremental as sugar, because
> I thought that people would know what it meant.

:C

OK.  I’m happy as long as it’s all explained somewhere (i.e.
bitmaps.rst).  Personally, I’d also like a pointer to that documentation
here.  (Sure, people should just look there if they don’t understand
something about bitmaps anyway, but I can’t see it hurting to just put a
pointer here anyway.)

> (More than "conditional", anyway, which is jargon I made up.)

But you make it up in this series, which is great for me, because that
means I get the definition (from the cover letter) without having to
look it up. O:-)

[...]

>>>  #
>>> +# @bitmap-mode: Specifies the type of data the bitmap should contain after
>>> +#   the operation concludes. Must be present if sync is 
>>> "bitmap".
>>> +#   Must NOT be present otherwise. (Since 4.1)
>>
>> Do we have any rule that qemu must enforce “must not”s? :-)
>>
>> (No, I don’t think so.  I think it’s very reasonable that you accept
>> bitmap-mode=conditional for sync=incremental.)
>>
> 
> Right, I left this a secret wiggle room. If you specify the correct
> bitmap sync mode for the incremental sugar, it will actually let it
> slide. If you specify the wrong one, it will error out.
> 
> However, I think this is perfectly correct advice from the API: Please
> use this mode with sync=bitmap and do not use it otherwise.
> 
> Would you like me to change it to be more technically correct and
> document the little affordance I made?

It’s probably better not to.  Better forbid as much as we can so that we
can break compatibility to users that happened to use it still “because
it works”.

Max



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [RFC PATCH v1 03/12] migration/ram: add support to send encrypted pages

2019-06-20 Thread Singh, Brijesh
When memory encryption is enabled, the guest memory will be encrypted with
the guest-specific key. The patch introduces the RAM_SAVE_FLAG_ENCRYPTED_PAGE
flag to distinguish encrypted data from plaintext, as encrypted pages
may need special handling. The kvm_memcrypt_save_outgoing_page() is used
by the sender to write the encrypted pages onto the socket; similarly, the
kvm_memcrypt_load_incoming_page() is used by the target to read the
encrypted pages from the socket and load them into the guest memory.
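
Pieced together from this patch and the SEV send/receive patches later in the
series, the record for one encrypted page should look roughly like this on the
wire (a sketch, not a definitive format; the payload half mirrors the receive
path in patch 10/12):

    qemu_put_be64(f, offset | RAM_SAVE_FLAG_ENCRYPTED_PAGE); /* via save_page_header() */
    qemu_put_be32(f, hdr_len);               /* SEV packet header (MAC, IV, ...) */
    qemu_put_buffer(f, hdr, hdr_len);
    qemu_put_be32(f, trans_len);             /* re-encrypted (transport) page    */
    qemu_put_buffer(f, trans, trans_len);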

Signed-off-by: Brijesh Singh 
---
 migration/ram.c | 54 -
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 908517fc2b..3c8977d508 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -57,6 +57,7 @@
 #include "qemu/uuid.h"
 #include "savevm.h"
 #include "qemu/iov.h"
+#include "sysemu/kvm.h"
 
 /***/
 /* ram save/restore */
@@ -76,6 +77,7 @@
 #define RAM_SAVE_FLAG_XBZRLE   0x40
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
+#define RAM_SAVE_FLAG_ENCRYPTED_PAGE   0x200
 
 static inline bool is_zero_range(uint8_t *p, uint64_t size)
 {
@@ -460,6 +462,9 @@ static QemuCond decomp_done_cond;
 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock 
*block,
  ram_addr_t offset, uint8_t *source_buf);
 
+static int ram_save_encrypted_page(RAMState *rs, PageSearchStatus *pss,
+   bool last_stage);
+
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
@@ -2006,6 +2011,36 @@ static int ram_save_multifd_page(RAMState *rs, RAMBlock 
*block,
 return 1;
 }
 
+/**
+ * ram_save_encrypted_page - send the given encrypted page to the stream
+ */
+static int ram_save_encrypted_page(RAMState *rs, PageSearchStatus *pss,
+   bool last_stage)
+{
+int ret;
+uint8_t *p;
+RAMBlock *block = pss->block;
+ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
+uint64_t bytes_xmit;
+
+p = block->host + offset;
+
+ram_counters.transferred +=
+save_page_header(rs, rs->f, block,
+offset | RAM_SAVE_FLAG_ENCRYPTED_PAGE);
+
+ret = kvm_memcrypt_save_outgoing_page(rs->f, p,
+TARGET_PAGE_SIZE, _xmit);
+if (ret) {
+return -1;
+}
+
+ram_counters.transferred += bytes_xmit;
+ram_counters.normal++;
+
+return 1;
+}
+
 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock 
*block,
  ram_addr_t offset, uint8_t *source_buf)
 {
@@ -2450,6 +2485,16 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss,
 return res;
 }
 
+/*
+ * If memory encryption is enabled then use memory encryption APIs
+ * to write the outgoing buffer to the wire. The encryption APIs
+ * will take care of accessing the guest memory and re-encrypt it
+ * for the transport purposes.
+ */
+ if (kvm_memcrypt_enabled()) {
+return ram_save_encrypted_page(rs, pss, last_stage);
+ }
+
 if (save_compress_page(rs, block, offset)) {
 return 1;
 }
@@ -4271,7 +4316,8 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 }
 
 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
- RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
+ RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE |
+ RAM_SAVE_FLAG_ENCRYPTED_PAGE)) {
 RAMBlock *block = ram_block_from_stream(f, flags);
 
 /*
@@ -4391,6 +4437,12 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 break;
 }
 break;
+case RAM_SAVE_FLAG_ENCRYPTED_PAGE:
+if (kvm_memcrypt_load_incoming_page(f, host)) {
+error_report("Failed to load encrypted incoming data");
+ret = -EINVAL;
+}
+break;
 case RAM_SAVE_FLAG_EOS:
 /* normal exit */
 multifd_recv_sync_main();
-- 
2.17.1



Re: [Qemu-devel] [PATCH 12/12] block/backup: loosen restriction on readonly bitmaps

2019-06-20 Thread Max Reitz
On 20.06.19 03:03, John Snow wrote:
> With the "never" sync policy, we actually can utilize readonly bitmaps
> now. Loosen the check at the QMP level, and tighten it based on
> provided arguments down at the job creation level instead.
> 
> Signed-off-by: John Snow 
> ---
>  block/backup.c | 6 ++
>  blockdev.c | 4 ++--
>  2 files changed, 8 insertions(+), 2 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [RFC PATCH v1 01/12] linux-headers: update kernel header to include SEV migration commands

2019-06-20 Thread Singh, Brijesh
Signed-off-by: Brijesh Singh 
---
 linux-headers/linux/kvm.h | 53 +++
 1 file changed, 53 insertions(+)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index c8423e760c..2bdd6a908e 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -492,6 +492,16 @@ struct kvm_dirty_log {
};
 };
 
+/* for KVM_GET_PAGE_ENC_BITMAP */
+struct kvm_page_enc_bitmap {
+__u64 start;
+__u64 num_pages;
+   union {
+   void *enc_bitmap; /* one bit per page */
+   __u64 padding2;
+   };
+};
+
 /* for KVM_CLEAR_DIRTY_LOG */
 struct kvm_clear_dirty_log {
__u32 slot;
@@ -1451,6 +1461,9 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_ARM_SVE */
 #define KVM_ARM_VCPU_FINALIZE_IOW(KVMIO,  0xc2, int)
 
+#define KVM_GET_PAGE_ENC_BITMAP _IOW(KVMIO, 0xc2, struct 
kvm_page_enc_bitmap)
+#define KVM_SET_PAGE_ENC_BITMAP _IOW(KVMIO, 0xc3, struct 
kvm_page_enc_bitmap)
+
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {
/* Guest initialization commands */
@@ -1531,6 +1544,46 @@ struct kvm_sev_dbg {
__u32 len;
 };
 
+struct kvm_sev_send_start {
+   __u32 policy;
+   __u64 pdh_cert_uaddr;
+   __u32 pdh_cert_len;
+   __u64 plat_cert_uaddr;
+   __u32 plat_cert_len;
+   __u64 amd_cert_uaddr;
+   __u32 amd_cert_len;
+   __u64 session_uaddr;
+   __u32 session_len;
+};
+
+struct kvm_sev_send_update_data {
+   __u64 hdr_uaddr;
+   __u32 hdr_len;
+   __u64 guest_uaddr;
+   __u32 guest_len;
+   __u64 trans_uaddr;
+   __u32 trans_len;
+};
+
+struct kvm_sev_receive_start {
+   __u32 handle;
+   __u32 policy;
+   __u64 pdh_uaddr;
+   __u32 pdh_len;
+   __u64 session_uaddr;
+   __u32 session_len;
+};
+
+struct kvm_sev_receive_update_data {
+   __u64 hdr_uaddr;
+   __u32 hdr_len;
+   __u64 guest_uaddr;
+   __u32 guest_len;
+   __u64 trans_uaddr;
+   __u32 trans_len;
+};
+
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX   (1 << 2)
-- 
2.17.1



Re: [Qemu-devel] [Qemu-riscv] [RFC v1 0/5] RISC-V: Add firmware loading support and default

2019-06-20 Thread David Abdurachmanov
On Thu, Jun 20, 2019 at 9:18 PM Alistair Francis  wrote:
>
> On Thu, Jun 20, 2019 at 1:16 AM Andrea Bolognani  wrote:
> >
> > On Wed, 2019-06-19 at 11:23 -0700, Alistair Francis wrote:
> > > On Wed, Jun 19, 2019 at 7:42 AM Bin Meng  wrote:
> > > > On Wed, Jun 19, 2019 at 10:30 PM Alistair Francis 
> > > >  wrote:
> > > > > On Wed, Jun 19, 2019 at 7:26 AM Bin Meng  wrote:
> > > > > > >  pc-bios/opensbi-riscv32-fw_jump.elf | Bin 0 -> 197988 bytes
> > > > > > >  pc-bios/opensbi-riscv64-fw_jump.elf | Bin 0 -> 200192 bytes
> > > > > >
> > > > > > Since we are considering adding "bios" images, I prefer to add the
> > > > > > pure binary images instead of ELF images here.
> > > > >
> > > > > I didn't think about that. Can we just boot them in QEMU like we do
> > > > > with the ELFs?
> > > >
> > > > Yes, use load_image_targphys() instead of load_elf().
> > >
> > > Ah, that is obvious. I'll update it to use the bin files then.
> >
> > I'm unclear on the advantages of using one format over the other,
>
> The main one that I see is that everyone else is already using .bin
> and no one else is using .elf.
>
> > but one question comes to mind: once this is in, we will probably
> > want to have OpenSBI packaged separately in distributions, the same
> > way it already happens for SeaBIOS, SLOF and edk2-based firmwares.
> >
> > Will using either of the formats prevent that from happening?
>
> Both options allow this.
>
> OE-Core already packages OpenSBI by default, Fedora and Debian are
> moving to OpenSBI for RISC-V targets as well.

Fedora has been using OpenSBI for the last 2 or 3 months now. I don't plan to update
the BBL builds. The OpenSBI package in Fedora/RISCV isn't finalized, but it does
ship *.elf and *.bin files.
>
> Any distro that supports the RISC-V toolchain (which is all
> upstreamed) can build OpenSBI.
>
> Alistair
>
> >
> > --
> > Andrea Bolognani / Red Hat / Virtualization
> >
>



Re: [Qemu-devel] [PATCH 11/12] iotests: add test 257 for bitmap-mode backups

2019-06-20 Thread Max Reitz
On 20.06.19 03:03, John Snow wrote:
> Signed-off-by: John Snow 
> ---
>  tests/qemu-iotests/257 |  412 +++
>  tests/qemu-iotests/257.out | 2199 
>  tests/qemu-iotests/group   |1 +
>  3 files changed, 2612 insertions(+)
>  create mode 100755 tests/qemu-iotests/257
>  create mode 100644 tests/qemu-iotests/257.out

This test is actually quite nicely written.

I like that I don’t have to read the reference output but can just grep
for “error”.

Only minor notes below.

> diff --git a/tests/qemu-iotests/257 b/tests/qemu-iotests/257
> new file mode 100755
> index 00..5f7f388504
> --- /dev/null
> +++ b/tests/qemu-iotests/257

[...]

> +class PatternGroup:
> +"""Grouping of Pattern objects. Initialize with an iterable of 
> Patterns."""
> +def __init__(self, patterns):
> +self.patterns = patterns
> +
> +def bits(self, granularity):
> +"""Calculate the unique bits dirtied by this pattern grouping"""
> +res = set()
> +for pattern in self.patterns:
> +lower = math.floor(pattern.offset / granularity)
> +upper = math.floor((pattern.offset + pattern.size - 1) / 
> granularity)
> +res = res | set(range(lower, upper + 1))

Why you’d do floor((x - 1) / y) + 1 has confused me quite a while.
Until I realized that oh yeah, Python’s range() is a right-open
interval.  I don’t like Python’s range().

(Yes, you’re right, this is better to read than just ceil(x / y).
Because it reminds people like me that range() is weird.)

> +return res
> +
> +GROUPS = [
> +PatternGroup([
> +# Batch 0: 4 clusters
> +mkpattern('0x49', 0x000),
> +mkpattern('0x6c', 0x010),   # 1M
> +mkpattern('0x6f', 0x200),   # 32M
> +mkpattern('0x76', 0x3ff)]), # 64M - 64K
> +PatternGroup([
> +# Batch 1: 6 clusters (3 new)
> +mkpattern('0x65', 0x000),   # Full overwrite
> +mkpattern('0x77', 0x00f8000),   # Partial-left (1M-32K)
> +mkpattern('0x72', 0x2008000),   # Partial-right (32M+32K)
> +mkpattern('0x69', 0x3fe)]), # Adjacent-left (64M - 128K)
> +PatternGroup([
> +# Batch 2: 7 clusters (3 new)
> +mkpattern('0x74', 0x001),   # Adjacent-right
> +mkpattern('0x69', 0x00e8000),   # Partial-left  (1M-96K)
> +mkpattern('0x6e', 0x2018000),   # Partial-right (32M+96K)
> +mkpattern('0x67', 0x3fe,
> +  2*GRANULARITY)]), # Overwrite [(64M-128K)-64M)
> +PatternGroup([
> +# Batch 3: 8 clusters (5 new)
> +# Carefully chosen such that nothing re-dirties the one cluster
> +# that copies out successfully before failure in Group #1.
> +mkpattern('0xaa', 0x001,
> +  3*GRANULARITY),   # Overwrite and 2x Adjacent-right
> +mkpattern('0xbb', 0x00d8000),   # Partial-left (1M-160K)
> +mkpattern('0xcc', 0x2028000),   # Partial-right (32M+160K)
> +mkpattern('0xdd', 0x3fc)]), # New; leaving a gap to the right
> +]

I’d place this four spaces to the left.  But maybe placing it here is
proper Python indentation, while moving it to the left would be C
indentation.

> +class Drive:
> +"""Represents, vaguely, a drive attached to a VM.
> +Includes format, graph, and device information."""
> +
> +def __init__(self, path, vm=None):
> +self.path = path
> +self.vm = vm
> +self.fmt = None
> +self.size = None
> +self.node = None
> +self.device = None
> +
> +@property
> +def name(self):
> +return self.node or self.device
> +
> +def img_create(self, fmt, size):
> +self.fmt = fmt
> +self.size = size
> +iotests.qemu_img_create('-f', self.fmt, self.path, str(self.size))
> +
> +def create_target(self, name, fmt, size):
> +basename = os.path.basename(self.path)
> +file_node_name = "file_{}".format(basename)
> +vm = self.vm
> +
> +log(vm.command('blockdev-create', job_id='bdc-file-job',
> +   options={
> +   'driver': 'file',
> +   'filename': self.path,
> +   'size': 0,
> +   }))
> +vm.run_job('bdc-file-job')
> +log(vm.command('blockdev-add', driver='file',
> +   node_name=file_node_name, filename=self.path))
> +
> +log(vm.command('blockdev-create', job_id='bdc-fmt-job',
> +   options={
> +   'driver': fmt,
> +   'file': file_node_name,
> +   'size': size,
> +   }))
> +vm.run_job('bdc-fmt-job')
> +log(vm.command('blockdev-add', driver=fmt,
> +   node_name=name,
> +   file=file_node_name))
> +self.fmt = fmt
> +self.size = 

[Qemu-devel] [RFC PATCH v1 09/12] target/i386: sev: add support to encrypt the outgoing page

2019-06-20 Thread Singh, Brijesh
The sev_save_outgoing_page() provides the implementation to encrypt the
guest private pages during transit. The routine uses the SEND_START
command to create the outgoing encryption context on the first call, then
uses the SEND_UPDATE_DATA command to encrypt the data before writing it
to the socket. While encrypting the data, SEND_UPDATE_DATA produces some
metadata (e.g. MAC, IV); the metadata is also sent to the target machine.
After migration is completed, we issue the SEND_FINISH command to transition
the SEV guest from the sending state to an unrunnable state.

Signed-off-by: Brijesh Singh 
---
 accel/kvm/kvm-all.c  |   1 +
 target/i386/sev.c| 229 +++
 target/i386/sev_i386.h   |   2 +
 target/i386/trace-events |   3 +
 4 files changed, 235 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 0654d9a7cd..85d6508e7f 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1784,6 +1784,7 @@ static int kvm_init(MachineState *ms)
 
 kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
 kvm_state->memcrypt_sync_page_enc_bitmap = sev_sync_page_enc_bitmap;
+kvm_state->memcrypt_save_outgoing_page = sev_save_outgoing_page;
 }
 
 ret = kvm_arch_init(ms, s);
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 2c7c496593..b5aa53ec44 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -27,6 +27,8 @@
 #include "sysemu/sysemu.h"
 #include "trace.h"
 #include "migration/blocker.h"
+#include "migration/qemu-file.h"
+#include "migration/misc.h"
 
 #define DEFAULT_GUEST_POLICY0x1 /* disable debug */
 #define DEFAULT_SEV_DEVICE  "/dev/sev"
@@ -718,6 +720,39 @@ sev_vm_state_change(void *opaque, int running, RunState 
state)
 }
 }
 
+static void
+sev_send_finish(void)
+{
+int ret, error;
+
+trace_kvm_sev_send_finish();
+ret = sev_ioctl(sev_state->sev_fd, KVM_SEV_SEND_FINISH, 0, &error);
+if (ret) {
+error_report("%s: SEND_FINISH ret=%d fw_error=%d '%s'",
+ __func__, ret, error, fw_error_to_str(error));
+}
+
+sev_set_guest_state(SEV_STATE_RUNNING);
+}
+
+static void
+sev_migration_state_notifier(Notifier *notifier, void *data)
+{
+MigrationState *s = data;
+
+if (migration_has_finished(s) ||
+migration_in_postcopy_after_devices(s) ||
+migration_has_failed(s)) {
+if (sev_check_state(SEV_STATE_SEND_UPDATE)) {
+sev_send_finish();
+}
+}
+}
+
+static Notifier sev_migration_state_notify = {
+.notify = sev_migration_state_notifier,
+};
+
 void *
 sev_guest_init(const char *id)
 {
@@ -804,6 +839,7 @@ sev_guest_init(const char *id)
 ram_block_notifier_add(&sev_ram_notifier);
 qemu_add_machine_init_done_notifier(&sev_machine_done_notify);
 qemu_add_vm_change_state_handler(sev_vm_state_change, s);
+add_migration_state_change_notifier(&sev_migration_state_notify);
 
 return s;
 err:
@@ -863,6 +899,199 @@ void sev_set_migrate_info(const char *pdh, const char 
*plat_cert,
 s->amd_cert = g_base64_decode(amd_cert, &s->amd_cert_len);
 }
 
+static int
+sev_get_send_session_length(void)
+{
+int ret, fw_err = 0;
+struct kvm_sev_send_start *start;
+
+start = g_new0(struct kvm_sev_send_start, 1);
+
+ret = sev_ioctl(sev_state->sev_fd, KVM_SEV_SEND_START, start, &fw_err);
+if (fw_err != SEV_RET_INVALID_LEN) {
+ret = -1;
+error_report("%s: failed to get session length ret=%d fw_error=%d 
'%s'",
+ __func__, ret, fw_err, fw_error_to_str(fw_err));
+goto err;
+}
+
+ret = start->session_len;
+err:
+g_free(start);
+return ret;
+}
+
+static int
+sev_send_start(SEVState *s, QEMUFile *f, uint64_t *bytes_sent)
+{
+gsize pdh_len = 0, plat_cert_len;
+int session_len, ret, fw_error;
+struct kvm_sev_send_start *start;
+guchar *pdh = NULL, *plat_cert = NULL, *session = NULL;
+
+if (!s->remote_pdh || !s->remote_plat_cert) {
+error_report("%s: missing remote PDH or PLAT_CERT", __func__);
+return 1;
+}
+
+start = g_new0(struct kvm_sev_send_start, 1);
+
+start->pdh_cert_uaddr = (unsigned long) s->remote_pdh;
+start->pdh_cert_len = s->remote_pdh_len;
+
+start->plat_cert_uaddr = (unsigned long)s->remote_plat_cert;
+start->plat_cert_len = s->remote_plat_cert_len;
+
+start->amd_cert_uaddr = (unsigned long)s->amd_cert;
+start->amd_cert_len = s->amd_cert_len;
+
+/* get the session length */
+session_len = sev_get_send_session_length();
+if (session_len < 0) {
+ret = 1;
+goto err;
+}
+
+session = g_new0(guchar, session_len);
+start->session_uaddr = (unsigned long)session;
+start->session_len = session_len;
+
+/* Get our PDH certificate */
+ret = sev_get_pdh_info(s->sev_fd, &pdh, &pdh_len,
+   &plat_cert, &plat_cert_len);
+if (ret) {
+error_report("Failed to get our PDH cert");
+goto err;
+}
+
+

[Qemu-devel] [RFC PATCH v1 06/12] doc: update AMD SEV to include Live migration flow

2019-06-20 Thread Singh, Brijesh
Signed-off-by: Brijesh Singh 
---
 docs/amd-memory-encryption.txt | 44 +-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/docs/amd-memory-encryption.txt b/docs/amd-memory-encryption.txt
index abb9a976f5..757e0d931a 100644
--- a/docs/amd-memory-encryption.txt
+++ b/docs/amd-memory-encryption.txt
@@ -89,7 +89,49 @@ TODO
 
 Live Migration
 
-TODO
+AMD SEV encrypts the memory of VMs and, because this encryption is done
+using an address tweak, the hypervisor will not be able to simply copy the
+ciphertext between machines to migrate a VM. Instead the AMD SEV Key
+Management API provides a set of functions which the hypervisor can use
+to package a guest page for migration, while maintaining the confidentiality
+provided by AMD SEV.
+
+SEV guest VMs have the concept of private and shared memory. The private
+memory is encrypted with the guest-specific key, while shared memory may
+be encrypted with the hypervisor key. The migration APIs provided by the
+SEV API spec should be used for migrating the private pages. The
+KVM_GET_PAGE_ENC_BITMAP ioctl can be used to get the guest page state
+bitmap. The bitmap can be used to check if the given guest page is
+private or shared.
+
+Before initiating the migration, we need to know the target's public
+Diffie-Hellman key (PDH) and certificate chain. They can be retrieved
+with the 'query-sev-capabilities' QMP command or using the sev-tool. The
+migrate-set-sev-info command can be used to pass the target's PDH and
+certificate chain.
+
+e.g.
+(QMP) migrate-set-sev-info pdh= plat-cert= \
+   amd-cert=
+(QMP) migrate tcp:0:
+
+Note: The AMD cert can be obtained from developer.amd.com/sev.
+
+During the migration flow, on the source hypervisor SEND_START is called first
+to create the outgoing encryption context. Based on the SEV guest policy, the
+certificates passed through migrate-set-sev-info will be validated
+before creating the encryption context. SEND_UPDATE_DATA is then called
+to encrypt the guest private pages. After the migration is completed,
+SEND_FINISH is called to destroy the encryption context and make the VM
+non-runnable to protect it against cloning.
+
+On the target hypervisor, RECEIVE_START is called first to create an
+incoming encryption context. RECEIVE_UPDATE_DATA is called to copy
+the received encrypted pages into guest memory. After migration of the
+pages is completed, RECEIVE_FINISH is called to make the VM runnable.
+
+For more information about the migration see SEV API Appendix A
+Usage flow (Live migration section).
 
 References
 -
-- 
2.17.1



[Qemu-devel] [RFC PATCH v1 02/12] kvm: introduce high-level API to support encrypted guest migration

2019-06-20 Thread Singh, Brijesh
When memory encryption is enabled in a VM, the guest pages will be
encrypted with the guest-specific key. To protect the confidentiality
of that data in transit and to support live migration, we need to use
platform-specific hooks to access the guest memory.

The kvm_memcrypt_save_outgoing_page() can be used by the sender to write
the encrypted pages and metadata associated with it on the socket.

The kvm_memcrypt_load_incoming_page() can be used by receiver to read the
incoming encrypted pages from the socket and load into the guest memory.

Encrypted VMs have the concept of private and shared memory. The private
memory is encrypted with the guest-specific key, while shared memory
may be encrypted with the hypervisor key. The KVM_{SET,GET}_PAGE_ENC_BITMAP
ioctls can be used to get/set the bitmap from/to the hypervisor.

The kvm_memcrypt_sync_page_enc_bitmap() can be used by the sender to get
the page encryption bitmap. The bitmap is used to determine the page state
(private or shared).

The kvm_memcrypt_save_outgoing_page_enc_bitmap() can be used by the sender
to write the page encryption bitmap onto the socket.

The kvm_memcrypt_load_incoming_page_enc_bitmap() can be used by the
receiver to read the page encryption bitmap from the socket.
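
A hedged sketch of how a save path is expected to consult these hooks (this
mirrors the call shape used later in the series; names other than the new API
are illustrative):

    if (kvm_memcrypt_enabled()) {
        /* let the platform hook (e.g. SEV) re-encrypt and write the page */
        if (kvm_memcrypt_save_outgoing_page(f, host_ptr, TARGET_PAGE_SIZE,
                                            &bytes_sent)) {
            return -1;    /* non-zero: hook missing or failed */
        }
    } else {
        qemu_put_buffer(f, host_ptr, TARGET_PAGE_SIZE);  /* plaintext path */
    }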

Signed-off-by: Brijesh Singh <>
---
 accel/kvm/kvm-all.c| 68 ++
 accel/kvm/sev-stub.c   | 28 +
 accel/stubs/kvm-stub.c | 30 +++
 include/sysemu/kvm.h   | 33 
 include/sysemu/sev.h   |  9 ++
 5 files changed, 168 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index b0c4bed6e3..4d5ff8b9f5 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -109,6 +109,15 @@ struct KVMState
 /* memory encryption */
 void *memcrypt_handle;
 int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
+int (*memcrypt_save_outgoing_page)(void *ehandle, QEMUFile *f,
+uint8_t *ptr, uint32_t sz, uint64_t *bytes_sent);
+int (*memcrypt_load_incoming_page)(void *ehandle, QEMUFile *f,
+uint8_t *ptr);
+int (*memcrypt_load_incoming_page_enc_bitmap)(void *ehandle, QEMUFile *f);
+int (*memcrypt_save_outgoing_page_enc_bitmap)(void *ehandle, QEMUFile *f,
+uint8_t *host, uint64_t length, unsigned long *bmap);
+int (*memcrypt_sync_page_enc_bitmap)(void *ehandle, uint8_t *host,
+uint64_t length, unsigned long *bmap);
 };
 
 KVMState *kvm_state;
@@ -164,6 +173,65 @@ int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
 return 1;
 }
 
+int kvm_memcrypt_save_outgoing_page(QEMUFile *f, uint8_t *ptr,
+uint32_t size, uint64_t *bytes_sent)
+{
+if (kvm_state->memcrypt_handle &&
+kvm_state->memcrypt_save_outgoing_page) {
+return 
kvm_state->memcrypt_save_outgoing_page(kvm_state->memcrypt_handle,
+f, ptr, size, bytes_sent);
+}
+
+return 1;
+}
+
+int kvm_memcrypt_load_incoming_page(QEMUFile *f, uint8_t *ptr)
+{
+if (kvm_state->memcrypt_handle &&
+kvm_state->memcrypt_load_incoming_page) {
+return 
kvm_state->memcrypt_load_incoming_page(kvm_state->memcrypt_handle,
+f, ptr);
+}
+
+return 1;
+}
+
+int kvm_memcrypt_load_incoming_page_enc_bitmap(QEMUFile *f)
+{
+if (kvm_state->memcrypt_handle &&
+kvm_state->memcrypt_load_incoming_page_enc_bitmap) {
+return kvm_state->memcrypt_load_incoming_page_enc_bitmap(
+kvm_state->memcrypt_handle, f);
+}
+
+return 1;
+}
+
+int kvm_memcrypt_save_outgoing_page_enc_bitmap(QEMUFile *f, uint8_t *host,
+   uint64_t length,
+   unsigned long *bmap)
+{
+if (kvm_state->memcrypt_handle &&
+kvm_state->memcrypt_save_outgoing_page_enc_bitmap) {
+return kvm_state->memcrypt_save_outgoing_page_enc_bitmap(
+kvm_state->memcrypt_handle, f, host, length, bmap);
+}
+
+return 1;
+}
+
+int kvm_memcrypt_sync_page_enc_bitmap(uint8_t *host, uint64_t length,
+  unsigned long *bmap)
+{
+if (kvm_state->memcrypt_handle &&
+kvm_state->memcrypt_sync_page_enc_bitmap) {
+return kvm_state->memcrypt_sync_page_enc_bitmap(
+kvm_state->memcrypt_handle, host, length, bmap);
+}
+
+return 1;
+}
+
 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
 {
 KVMState *s = kvm_state;
diff --git a/accel/kvm/sev-stub.c b/accel/kvm/sev-stub.c
index 4f97452585..5d8c3f2ecd 100644
--- a/accel/kvm/sev-stub.c
+++ b/accel/kvm/sev-stub.c
@@ -24,3 +24,31 @@ void *sev_guest_init(const char *id)
 {
 return NULL;
 }
+
+int sev_save_outgoing_page(void *handle, QEMUFile *f, uint8_t *ptr,
+   uint32_t size, uint64_t *bytes_sent)
+{
+return 1;
+}
+
+int sev_load_incoming_page(void *handle, QEMUFile *f, uint8_t *ptr)
+{

[Qemu-devel] [RFC PATCH v1 12/12] target/i386: sev: remove migration blocker

2019-06-20 Thread Singh, Brijesh
Signed-off-by: Brijesh Singh 
---
 target/i386/sev.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/target/i386/sev.c b/target/i386/sev.c
index dc1e974d93..095ef4c729 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -34,7 +34,6 @@
 #define DEFAULT_SEV_DEVICE  "/dev/sev"
 
 static SEVState *sev_state;
-static Error *sev_mig_blocker;
 
 static const char *const sev_fw_errlist[] = {
 "",
@@ -685,7 +684,6 @@ static void
 sev_launch_finish(SEVState *s)
 {
 int ret, error;
-Error *local_err = NULL;
 
 trace_kvm_sev_launch_finish();
 ret = sev_ioctl(sev_state->sev_fd, KVM_SEV_LAUNCH_FINISH, 0, &error);
@@ -696,16 +694,6 @@ sev_launch_finish(SEVState *s)
 }
 
 sev_set_guest_state(SEV_STATE_RUNNING);
-
-/* add migration blocker */
-error_setg(&sev_mig_blocker,
-   "SEV: Migration is not implemented");
-ret = migrate_add_blocker(sev_mig_blocker, &local_err);
-if (local_err) {
-error_report_err(local_err);
-error_free(sev_mig_blocker);
-exit(1);
-}
 }
 
 static int
-- 
2.17.1



[Qemu-devel] [RFC PATCH v1 00/12] Add SEV guest live migration support

2019-06-20 Thread Singh, Brijesh
AMD SEV encrypts the memory of VMs and because this encryption is done using
an address tweak, the hypervisor will not be able to simply copy ciphertext
between machines to migrate a VM. Instead the AMD SEV Key Management API
provides a set of functions which the hypervisor can use to package a
guest's encrypted pages for migration, while maintaining the confidentiality
provided by AMD SEV.

The patch series adds the support required in QEMU to perform SEV
guest live migration. Before initiating the live migration, a user
should use the newly added 'migrate-set-sev-info' command to pass the
target machine's certificate chain. See docs/amd-memory-encryption.txt
for further details.

The patch series depends on kernel patches available here:
https://marc.info/?l=kvm=156104873409876=2

The complete tree with patch is available at:
https://github.com/codomania/qemu/tree/sev-migration-rfc-v1

Brijesh Singh (12):
  linux-headers: update kernel header to include SEV migration commands
  kvm: introduce high-level API to support encrypted guest migration
  migration/ram: add support to send encrypted pages
  kvm: add support to sync the page encryption state bitmap
  doc: update AMD SEV API spec web link
  doc: update AMD SEV to include Live migration flow
  target/i386: sev: do not create launch context for an incoming guest
  target.json: add migrate-set-sev-info command
  target/i386: sev: add support to encrypt the outgoing page
  target/i386: sev: add support to load incoming encrypted page
  migration: add support to migrate page encryption bitmap
  target/i386: sev: remove migration blocker

 accel/kvm/kvm-all.c|  75 ++
 accel/kvm/sev-stub.c   |  28 ++
 accel/stubs/kvm-stub.c |  30 +++
 docs/amd-memory-encryption.txt |  46 +++-
 include/exec/ram_addr.h|   2 +
 include/sysemu/kvm.h   |  33 +++
 include/sysemu/sev.h   |   9 +
 linux-headers/linux/kvm.h  |  53 
 migration/ram.c| 121 -
 qapi/target.json   |  18 ++
 target/i386/monitor.c  |  10 +
 target/i386/sev-stub.c |   5 +
 target/i386/sev.c  | 471 +++--
 target/i386/sev_i386.h |  11 +-
 target/i386/trace-events   |   9 +
 15 files changed, 902 insertions(+), 19 deletions(-)

-- 
2.17.1



[Qemu-devel] [RFC PATCH v1 11/12] migration: add support to migrate page encryption bitmap

2019-06-20 Thread Singh, Brijesh
When memory encryption is enabled, the hypervisor maintains a page
encryption bitmap which is consulted by the hypervisor during migration to check
whether a page is private or shared. The bitmap is built during VM bootup and
must be migrated to the target host so that the hypervisor on the target host can
use it for future migrations.
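
From the save side below and the SEV load side in this patch, the per-RAMBlock
record appears to be laid out roughly as follows (a sketch; the SEV save half is
not shown in this posting, so the tail is inferred from the load path):

    qemu_put_be64(f, RAM_SAVE_FLAG_PAGE_ENCRYPTED_BITMAP);   /* record flag */
    qemu_put_byte(f, strlen(block->idstr));                  /* block id    */
    qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
    qemu_put_be64(f, base_gpa);          /* start of the region             */
    qemu_put_be64(f, length);            /* region length in bytes          */
    /* ... followed by the encryption bitmap itself, one bit per page       */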

Signed-off-by: Brijesh Singh 
---
 accel/kvm/kvm-all.c  |  4 +++
 migration/ram.c  | 43 +-
 target/i386/sev.c| 56 
 target/i386/trace-events |  3 +++
 4 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index fe65c8eb5d..0d75ad94f8 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1786,6 +1786,10 @@ static int kvm_init(MachineState *ms)
 kvm_state->memcrypt_sync_page_enc_bitmap = sev_sync_page_enc_bitmap;
 kvm_state->memcrypt_save_outgoing_page = sev_save_outgoing_page;
 kvm_state->memcrypt_load_incoming_page = sev_load_incoming_page;
+kvm_state->memcrypt_load_incoming_page_enc_bitmap =
+sev_load_incoming_page_enc_bitmap;
+kvm_state->memcrypt_save_outgoing_page_enc_bitmap =
+sev_save_outgoing_page_enc_bitmap;
 }
 
 ret = kvm_arch_init(ms, s);
diff --git a/migration/ram.c b/migration/ram.c
index a8631c0896..5c8403588f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -78,6 +78,7 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 #define RAM_SAVE_FLAG_ENCRYPTED_PAGE   0x200
+#define RAM_SAVE_FLAG_PAGE_ENCRYPTED_BITMAP   0x400
 
 static inline bool is_zero_range(uint8_t *p, uint64_t size)
 {
@@ -3551,6 +3552,35 @@ out:
 return done;
 }
 
+/**
+ * migration_save_page_enc_bitmap: function to send the page enc bitmap
+ *
+ * Returns zero to indicate success or negative on error
+ */
+static int migration_save_page_enc_bitmap(QEMUFile *f, RAMState *rs)
+{
+int r;
+RAMBlock *block;
+
+RAMBLOCK_FOREACH_MIGRATABLE(block) {
+/* ROM regions do not contain encrypted data, skip sending the bitmap */
+if (memory_region_is_rom(block->mr)) {
+continue;
+}
+
+qemu_put_be64(f, RAM_SAVE_FLAG_PAGE_ENCRYPTED_BITMAP);
+qemu_put_byte(f, strlen(block->idstr));
+qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
+r = kvm_memcrypt_save_outgoing_page_enc_bitmap(f, block->host,
+block->max_length, block->encbmap);
+if (r) {
+return -1;
+}
+}
+
+return 0;
+}
+
 /**
  * ram_save_complete: function called to send the remaining amount of ram
  *
@@ -3595,6 +3625,10 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 flush_compressed_data(rs);
 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
 
+if (kvm_memcrypt_enabled()) {
+ret = migration_save_page_enc_bitmap(f, rs);
+}
+
 rcu_read_unlock();
 
 multifd_send_sync_main();
@@ -4343,7 +4377,8 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 
 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
  RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE |
- RAM_SAVE_FLAG_ENCRYPTED_PAGE)) {
+ RAM_SAVE_FLAG_ENCRYPTED_PAGE |
+ RAM_SAVE_FLAG_PAGE_ENCRYPTED_BITMAP)) {
 RAMBlock *block = ram_block_from_stream(f, flags);
 
 /*
@@ -4469,6 +4504,12 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 ret = -EINVAL;
 }
 break;
+case RAM_SAVE_FLAG_PAGE_ENCRYPTED_BITMAP:
+if (kvm_memcrypt_load_incoming_page_enc_bitmap(f)) {
+error_report("Failed to load page enc bitmap");
+ret = -EINVAL;
+}
+break;
 case RAM_SAVE_FLAG_EOS:
 /* normal exit */
 multifd_recv_sync_main();
diff --git a/target/i386/sev.c b/target/i386/sev.c
index b7feedce7d..dc1e974d93 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -896,6 +896,8 @@ int sev_sync_page_enc_bitmap(void *handle, uint8_t *host, 
uint64_t size,
 return 1;
 }
 
+trace_kvm_sev_sync_page_enc_bitmap(base_gpa, size);
+
 e.enc_bitmap = bitmap;
 e.start = base_gpa >> TARGET_PAGE_BITS;
 e.num_pages = pages;
@@ -1216,6 +1218,60 @@ int sev_load_incoming_page(void *handle, QEMUFile *f, 
uint8_t *ptr)
 return sev_receive_update_data(f, ptr);
 }
 
+int sev_load_incoming_page_enc_bitmap(void *handle, QEMUFile *f)
+{
+void *bmap;
+unsigned long pages, length;
+unsigned long bmap_size, base_gpa;
+struct kvm_page_enc_bitmap e = {};
+
+base_gpa = qemu_get_be64(f);
+length = qemu_get_be64(f);
+pages = length >> TARGET_PAGE_BITS;
+
+bmap_size = BITS_TO_LONGS(pages) * sizeof(unsigned long);
+bmap = g_malloc0(bmap_size);
+

[Qemu-devel] [RFC PATCH v1 08/12] target.json: add migrate-set-sev-info command

2019-06-20 Thread Singh, Brijesh
The command can be used by the hypervisor to specify the target Platform
Diffie-Hellman key (PDH) and certificate chain before starting the SEV
guest migration. The values passed through the command will be used while
creating the outgoing encryption context.
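
A hedged QMP usage sketch (member names follow the schema below; the base64
strings are placeholders):

{ "execute": "migrate-set-sev-info",
  "arguments": { "pdh": "<base64 PDH>",
                 "plat-cert": "<base64 platform cert chain>",
                 "amd-cert": "<base64 ASK+OCA chain>" } }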

Signed-off-by: Brijesh Singh 
---
 qapi/target.json   | 18 ++
 target/i386/monitor.c  | 10 ++
 target/i386/sev-stub.c |  5 +
 target/i386/sev.c  | 11 +++
 target/i386/sev_i386.h |  9 -
 5 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/qapi/target.json b/qapi/target.json
index 1d4d54b600..4109772298 100644
--- a/qapi/target.json
+++ b/qapi/target.json
@@ -512,3 +512,21 @@
 ##
 { 'command': 'query-cpu-definitions', 'returns': ['CpuDefinitionInfo'],
   'if': 'defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_I386) || 
defined(TARGET_S390X) || defined(TARGET_MIPS)' }
+
+##
+# @migrate-set-sev-info:
+#
+# The command is used to provide the target host information used during the
+# SEV guest migration.
+#
+# @pdh the target host platform diffie-hellman key encoded in base64
+#
+# @plat-cert the target host platform certificate chain encoded in base64
+#
+# @amd-cert AMD certificate chain which include ASK and OCA encoded in base64
+#
+# Since 4.3
+#
+##
+{ 'command': 'migrate-set-sev-info',
+  'data': { 'pdh': 'str', 'plat-cert': 'str', 'amd-cert' : 'str' }}
diff --git a/target/i386/monitor.c b/target/i386/monitor.c
index 56e2dbece7..68e2e2b8ec 100644
--- a/target/i386/monitor.c
+++ b/target/i386/monitor.c
@@ -736,3 +736,13 @@ SevCapability *qmp_query_sev_capabilities(Error **errp)
 
 return data;
 }
+
+void qmp_migrate_set_sev_info(const char *pdh, const char *plat_cert,
+  const char *amd_cert, Error **errp)
+{
+if (sev_enabled()) {
+sev_set_migrate_info(pdh, plat_cert, amd_cert);
+} else {
+error_setg(errp, "SEV is not enabled");
+}
+}
diff --git a/target/i386/sev-stub.c b/target/i386/sev-stub.c
index e5ee13309c..173bfa6374 100644
--- a/target/i386/sev-stub.c
+++ b/target/i386/sev-stub.c
@@ -48,3 +48,8 @@ SevCapability *sev_get_capabilities(void)
 {
 return NULL;
 }
+
+void sev_set_migrate_info(const char *pdh, const char *plat_cert,
+  const char *amd_cert)
+{
+}
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 1b05fcf9a9..2c7c496593 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -852,6 +852,17 @@ int sev_sync_page_enc_bitmap(void *handle, uint8_t *host, 
uint64_t size,
 return 0;
 }
 
+void sev_set_migrate_info(const char *pdh, const char *plat_cert,
+  const char *amd_cert)
+{
+SEVState *s = sev_state;
+
+s->remote_pdh = g_base64_decode(pdh, &s->remote_pdh_len);
+s->remote_plat_cert = g_base64_decode(plat_cert,
+  &s->remote_plat_cert_len);
+s->amd_cert = g_base64_decode(amd_cert, &s->amd_cert_len);
+}
+
 static void
 sev_register_types(void)
 {
diff --git a/target/i386/sev_i386.h b/target/i386/sev_i386.h
index c0f9373beb..258047ab2c 100644
--- a/target/i386/sev_i386.h
+++ b/target/i386/sev_i386.h
@@ -39,7 +39,8 @@ extern uint32_t sev_get_cbit_position(void);
 extern uint32_t sev_get_reduced_phys_bits(void);
 extern char *sev_get_launch_measurement(void);
 extern SevCapability *sev_get_capabilities(void);
-
+extern void sev_set_migrate_info(const char *pdh, const char *plat_cert,
+ const char *amd_cert);
 typedef struct QSevGuestInfo QSevGuestInfo;
 typedef struct QSevGuestInfoClass QSevGuestInfoClass;
 
@@ -81,6 +82,12 @@ struct SEVState {
 int sev_fd;
 SevState state;
 gchar *measurement;
+guchar *remote_pdh;
+size_t remote_pdh_len;
+guchar *remote_plat_cert;
+size_t remote_plat_cert_len;
+guchar *amd_cert;
+size_t amd_cert_len;
 };
 
 typedef struct SEVState SEVState;
-- 
2.17.1



[Qemu-devel] [RFC PATCH v1 05/12] doc: update AMD SEV API spec web link

2019-06-20 Thread Singh, Brijesh
Signed-off-by: Brijesh Singh 
---
 docs/amd-memory-encryption.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/amd-memory-encryption.txt b/docs/amd-memory-encryption.txt
index 43bf3ee6a5..abb9a976f5 100644
--- a/docs/amd-memory-encryption.txt
+++ b/docs/amd-memory-encryption.txt
@@ -98,7 +98,7 @@ AMD Memory Encryption whitepaper:
 
http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf
 
 Secure Encrypted Virtualization Key Management:
-[1] http://support.amd.com/TechDocs/55766_SEV-KM API_Specification.pdf
+[1] https://developer.amd.com/sev/ (Secure Encrypted Virtualization API)
 
 KVM Forum slides:
 
http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf
-- 
2.17.1



[Qemu-devel] [RFC PATCH v1 07/12] target/i386: sev: do not create launch context for an incoming guest

2019-06-20 Thread Singh, Brijesh
The LAUNCH_START command is used to create an encryption context for a
newly created guest; for an incoming guest, the RECEIVE_START command
should be used instead.

Signed-off-by: Brijesh Singh 
---
 target/i386/sev.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/target/i386/sev.c b/target/i386/sev.c
index dd3814e25f..1b05fcf9a9 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -789,10 +789,16 @@ sev_guest_init(const char *id)
 goto err;
 }
 
-ret = sev_launch_start(s);
-if (ret) {
-error_report("%s: failed to create encryption context", __func__);
-goto err;
+/*
+ * The LAUNCH context is used for a new guest; if it is an incoming guest,
+ * then the RECEIVE context will be created after the connection is
+ * established.
+ */
+if (!runstate_check(RUN_STATE_INMIGRATE)) {
+ret = sev_launch_start(s);
+if (ret) {
+error_report("%s: failed to create encryption context", __func__);
+goto err;
+}
 }
 
 ram_block_notifier_add(&sev_ram_notifier);
-- 
2.17.1



[Qemu-devel] [RFC PATCH v1 04/12] kvm: add support to sync the page encryption state bitmap

2019-06-20 Thread Singh, Brijesh
SEV VMs have a concept of private and shared memory. Private memory
is encrypted with a guest-specific key, while shared memory may be encrypted
with the hypervisor key. The KVM_GET_PAGE_ENC_BITMAP ioctl can be used to get a
bitmap indicating whether a guest page is private or shared. A private
page must be transmitted using the SEV migration commands.

Signed-off-by: Brijesh Singh 
---
 accel/kvm/kvm-all.c |  1 +
 include/exec/ram_addr.h |  2 ++
 migration/ram.c | 28 +++-
 target/i386/sev.c   | 27 +++
 4 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 4d5ff8b9f5..0654d9a7cd 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1783,6 +1783,7 @@ static int kvm_init(MachineState *ms)
 }
 
 kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
+kvm_state->memcrypt_sync_page_enc_bitmap = sev_sync_page_enc_bitmap;
 }
 
 ret = kvm_arch_init(ms, s);
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index f96777bb99..2145059afc 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -51,6 +51,8 @@ struct RAMBlock {
 unsigned long *unsentmap;
 /* bitmap of already received pages in postcopy */
 unsigned long *receivedmap;
+/* bitmap of page encryption state for an encrypted guest */
+unsigned long *encbmap;
 };
 
 static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
diff --git a/migration/ram.c b/migration/ram.c
index 3c8977d508..a8631c0896 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1680,6 +1680,9 @@ static void migration_bitmap_sync_range(RAMState *rs, 
RAMBlock *rb,
 rs->migration_dirty_pages +=
 cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
  &rs->num_dirty_pages_period);
+if (kvm_memcrypt_enabled()) {
+kvm_memcrypt_sync_page_enc_bitmap(rb->host, length, rb->encbmap);
+}
 }
 
 /**
@@ -2465,6 +2468,22 @@ static bool save_compress_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset)
 return false;
 }
 
+/**
+ * encrypted_test_bitmap: check if the page is encrypted
+ *
+ * Returns a bool indicating whether the page is encrypted.
+ */
+static bool encrypted_test_bitmap(RAMState *rs, RAMBlock *block,
+  unsigned long page)
+{
+/* ROM devices contain unencrypted data */
+if (memory_region_is_rom(block->mr)) {
+return false;
+}
+
+return test_bit(page, block->encbmap);
+}
+
 /**
  * ram_save_target_page: save one target page
  *
@@ -2491,7 +2510,8 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss,
  * will take care of accessing the guest memory and re-encrypt it
  * for the transport purposes.
  */
- if (kvm_memcrypt_enabled()) {
+ if (kvm_memcrypt_enabled() &&
+ encrypted_test_bitmap(rs, pss->block, pss->page)) {
 return ram_save_encrypted_page(rs, pss, last_stage);
  }
 
@@ -2724,6 +2744,8 @@ static void ram_save_cleanup(void *opaque)
 block->bmap = NULL;
 g_free(block->unsentmap);
 block->unsentmap = NULL;
+g_free(block->encbmap);
+block->encbmap = NULL;
 }
 
 xbzrle_cleanup();
@@ -3251,6 +3273,10 @@ static void ram_list_init_bitmaps(void)
 block->unsentmap = bitmap_new(pages);
 bitmap_set(block->unsentmap, 0, pages);
 }
+if (kvm_memcrypt_enabled()) {
+block->encbmap = bitmap_new(pages);
+bitmap_set(block->encbmap, 0, pages);
+}
 }
 }
 }
diff --git a/target/i386/sev.c b/target/i386/sev.c
index 6dbdc3cdf1..dd3814e25f 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -819,6 +819,33 @@ sev_encrypt_data(void *handle, uint8_t *ptr, uint64_t len)
 return 0;
 }
 
+int sev_sync_page_enc_bitmap(void *handle, uint8_t *host, uint64_t size,
+unsigned long *bitmap)
+{
+int r;
+unsigned long base_gpa;
+KVMState *s = kvm_state;
+struct kvm_page_enc_bitmap e = {};
+unsigned long pages = size >> TARGET_PAGE_BITS;
+
+r = kvm_physical_memory_addr_from_host(kvm_state, host, &base_gpa);
+if (!r) {
+return 1;
+}
+
+e.enc_bitmap = bitmap;
+e.start = base_gpa >> TARGET_PAGE_BITS;
+e.num_pages = pages;
+
+if (kvm_vm_ioctl(s, KVM_GET_PAGE_ENC_BITMAP, &e) == -1) {
+error_report("%s: get page_enc bitmap start 0x%llx pages 0x%llx",
+__func__, e.start, e.num_pages);
+return 1;
+}
+
+return 0;
+}
+
 static void
 sev_register_types(void)
 {
-- 
2.17.1



Re: [Qemu-devel] [PATCH v5 0/3] virtio-scsi: restart DMA after iothread

2019-06-20 Thread Paolo Bonzini
On 20/06/19 19:37, Stefan Hajnoczi wrote:
> v5:
>  * Plumbing vm change state handlers into DeviceClass/BusClass is a rather
>large bug fix.  Instead I've combined the previous priorities approach with
>the observation from Kevin and Paolo that we really want to order by qdev
>tree depth.
> 
>The new qdev_add_vm_change_state_handler() API lets DeviceStates register
>callbacks that execute in qdev tree depth order.  This solves the
>virtio-scsi bug since the virtio-scsi device's callback must complete 
> before
>its child scsi-disk's callback runs.
> 
>Is this a good compromise for everyone?

Yes!  Perhaps a bit of a hack, but it works and both the API and the
implementation are very sane.  Converting other devices to use
qdev_add_vm_change_state_handler() is left as an exercise for the
reviewer, I guess? :)

Paolo
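
For anyone who does want to try that exercise, a rough sketch of a conversion
is below.  The device type, field names and callbacks are hypothetical; only
qdev_add_vm_change_state_handler() and qemu_del_vm_change_state_handler() are
taken from the series (the former is used the same way in the virtio-scsi
patch elsewhere in this thread).

/*
 * Hypothetical device, not part of the series: it previously registered a
 * plain VM change state handler in realize() and is converted to the
 * qdev-aware API so its callback is ordered by qdev tree depth relative to
 * its parent device.
 */
#include "qemu/osdep.h"
#include "hw/qdev-core.h"
#include "sysemu/sysemu.h"

typedef struct MyDeviceState {
    DeviceState parent_obj;           /* assumed QOM parent */
    VMChangeStateEntry *vmsentry;
} MyDeviceState;

static void my_device_vm_state_change(void *opaque, int running, RunState state)
{
    MyDeviceState *s = opaque;

    if (running && s->vmsentry) {
        /* Resume deferred work; the parent device's handler has already run. */
    }
}

static void my_device_realize(DeviceState *dev, Error **errp)
{
    MyDeviceState *s = (MyDeviceState *)dev;

    /*
     * Before: s->vmsentry =
     *     qemu_add_vm_change_state_handler(my_device_vm_state_change, s);
     * with undefined ordering relative to the parent's handler.
     */
    s->vmsentry = qdev_add_vm_change_state_handler(dev,
                                                   my_device_vm_state_change,
                                                   s);
}

static void my_device_unrealize(DeviceState *dev, Error **errp)
{
    MyDeviceState *s = (MyDeviceState *)dev;

    qemu_del_vm_change_state_handler(s->vmsentry);
}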



Re: [Qemu-devel] [Qemu-riscv] [RFC v1 0/5] RISC-V: Add firmware loading support and default

2019-06-20 Thread Alistair Francis
On Thu, Jun 20, 2019 at 1:16 AM Andrea Bolognani  wrote:
>
> On Wed, 2019-06-19 at 11:23 -0700, Alistair Francis wrote:
> > On Wed, Jun 19, 2019 at 7:42 AM Bin Meng  wrote:
> > > On Wed, Jun 19, 2019 at 10:30 PM Alistair Francis  
> > > wrote:
> > > > On Wed, Jun 19, 2019 at 7:26 AM Bin Meng  wrote:
> > > > > >  pc-bios/opensbi-riscv32-fw_jump.elf | Bin 0 -> 197988 bytes
> > > > > >  pc-bios/opensbi-riscv64-fw_jump.elf | Bin 0 -> 200192 bytes
> > > > >
> > > > > Since we are considering adding "bios" images, I prefer to add the
> > > > > pure binary images instead of ELF images here.
> > > >
> > > > I didn't think about that. Can we just boot them in QEMU like we do
> > > > with the ELFs?
> > >
> > > Yes, use load_image_targphys() instead of load_elf().
> >
> > Ah, that is obvious. I'll update it to use the bin files then.
>
> I'm unclear on the advantages of using one format over the other,

The main one that I see is that everyone else is already using .bin
and no one else is using .elf.

> but one question comes to mind: once this is in, we will probably
> want to have OpenSBI packaged separately in distributions, the same
> way it already happens for SeaBIOS, SLOF and edk2-based firmwares.
>
> Will using either of the formats prevent that from happening?

Both options allow this.

OE-Core already packages OpenSBI by default, Fedora and Debian are
moving to OpenSBI for RISC-V targets as well.

Any distro that supports the RISC-V toolchain (which is all
upstreamed) can build OpenSBI.

Alistair

>
> --
> Andrea Bolognani / Red Hat / Virtualization
>
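
To make the load_image_targphys() suggestion above concrete, here is a small
sketch of loading a raw fw_jump.bin image.  The wrapper function, the
FW_JUMP_ADDR value and the error handling are placeholder assumptions, not
code from this series; only load_image_targphys() itself is the existing
loader helper being discussed.

/*
 * Sketch only: copy a raw firmware binary into guest RAM at a fixed
 * address.  FW_JUMP_ADDR is an assumed DRAM base / firmware entry point.
 */
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "hw/loader.h"

#define FW_JUMP_ADDR 0x80000000ULL

void riscv_load_firmware_bin(const char *filename, uint64_t ram_size)
{
    /* Unlike load_elf(), this does no parsing: the image is copied as-is. */
    int size = load_image_targphys(filename, FW_JUMP_ADDR, ram_size);

    if (size < 0) {
        error_report("could not load firmware '%s'", filename);
        exit(1);
    }
    /* A raw image carries no entry point; the harts simply start at
     * FW_JUMP_ADDR. */
}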



Re: [Qemu-devel] [PATCH 05/12] hbitmap: enable merging across granularities

2019-06-20 Thread John Snow



On 6/20/19 11:47 AM, Max Reitz wrote:
> On 20.06.19 03:03, John Snow wrote:
>> Signed-off-by: John Snow 
>> ---
>>  util/hbitmap.c | 22 +-
>>  1 file changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/util/hbitmap.c b/util/hbitmap.c
>> index 45d1725daf..0d6724b7bc 100644
>> --- a/util/hbitmap.c
>> +++ b/util/hbitmap.c
>> @@ -777,7 +777,17 @@ void hbitmap_truncate(HBitmap *hb, uint64_t size)
>>  
>>  bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b)
>>  {
>> -return (a->size == b->size) && (a->granularity == b->granularity);
>> +return (a->size == b->size);
>> +}
>> +
>> +static void hbitmap_sparse_merge(HBitmap *dst, const HBitmap *src)
>> +{
>> +uint64_t offset = 0;
>> +uint64_t count = src->orig_size;
>> +
>> +while (hbitmap_next_dirty_area(src, &offset, &count)) {
>> +hbitmap_set(dst, offset, count);
>> +}
>>  }
>>  
>>  /**
>> @@ -804,6 +814,16 @@ bool hbitmap_merge(const HBitmap *a, const HBitmap *b, 
>> HBitmap *result)
>>  return true;
>>  }
>>  
>> +if (a->size != b->size) {
> 
> Don’t you mean s/size/granularity/?
> 
> Right now, this is dead code, which leads me to asking for a test.
> (Well, no, I would’ve asked anyway.)
> 
> Max
> 

Ah, crud. Caught red-handed. Yes and Yes.

As to your later question: Can we use this for backup initialization?
Also yes; but it might be the case that we want the copy bitmap to
become a full-fledged "bdrv dirty bitmap" instead of an hbitmap, which
will actually make this easier and probably eliminate the need for the
"_take" or "_claim" function I added, too.



Re: [Qemu-devel] [PATCH v4 0/5] network announce; interface selection & IDs

2019-06-20 Thread Dr. David Alan Gilbert
* Jason Wang (jasow...@redhat.com) wrote:
> 
> On 2019/6/13 5:59 PM, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" 
> > 
> > Laine asked for some extra features on the network announce support;
> 
> 
> It's better to explain why this feature is needed.

Yes, I'll reword.

> Is this because libvirt
> can change the host network topology on the fly?

It's because something can change the network topology on the fly - not
necessarily just libvirt.  Whereas previously we were using the
announce mechanism mainly for migration reasons, now we also want
to use it to announce topology changes; those potentially include things
that libvirt gets told by a higher management layer - such as the
failure of one network path.

Dave

> 
> Thanks
> 
> 
> > 
> > The first allows the announce timer to announce on a subset of the
> > interfaces.
> > 
> > The second allows there to be multiple timers, each with their own
> > parameters (including the interface list).
> > 
> > Signed-off-by: Dr. David Alan Gilbert 
> > 
> > v4
> >Minor typo fixes
> >Expanded the test to check we can stop a running announce
> > 
> > Dr. David Alan Gilbert (5):
> >net/announce: Allow optional list of interfaces
> >net/announce: Add HMP optional interface list
> >net/announce: Add optional ID
> >net/announce: Add HMP optional ID
> >net/announce: Expand test for stopping self announce
> > 
> >   hmp-commands.hx |  7 +++-
> >   hmp.c   | 41 +++-
> >   hw/net/virtio-net.c |  4 +-
> >   include/net/announce.h  |  8 +++-
> >   net/announce.c  | 83 ++---
> >   net/trace-events|  3 +-
> >   qapi/net.json   | 16 ++--
> >   tests/virtio-net-test.c | 57 ++--
> >   8 files changed, 192 insertions(+), 27 deletions(-)
> > 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



Re: [Qemu-devel] virtual machine cpu soft lock when qemu attach disk

2019-06-20 Thread Stefan Hajnoczi
On Tue, Jun 04, 2019 at 04:39:00PM +0800, l00284672 wrote:
> Hi, I found a problem where a virtual machine's CPU soft-locks when I attach a
> disk to the VM in the case that
> 
> the backend storage network has a large delay or the IO pressure is too high.
> 
> 1) The disk xml which I attached is:
> 
>     
>   
>   
>   
>   
>   
>   
>     
> 
> 2) The bt of qemu main thread:
> 
> #0 0x9d78402c in pread64 () from /lib64/libpthread.so.0
> #1 0xce3357d8 in pread64 (__offset=0, __nbytes=4096,
> __buf=0xd47a5200, __fd=202) at /usr/include/bits/unistd.h:99
> #2 raw_is_io_aligned (fd=fd@entry=202, buf=buf@entry=0xd47a5200,
> len=len@entry=4096) at block/raw_posix.c:294
> #3 0xce33597c in raw_probe_alignment (bs=bs@entry=0xd32ea920,
> fd=202, errp=errp@entry=0xfef7a330) at block/raw_posix.c:349
> #4 0xce335a48 in raw_refresh_limits (bs=0xd32ea920,
> errp=0xfef7a330) at block/raw_posix.c:811
> #5 0xce3404b0 in bdrv_refresh_limits (bs=0xd32ea920,
> errp=0xfef7a330, errp@entry=0xfef7a360) at block/io.c:122
> #6 0xce340504 in bdrv_refresh_limits (bs=bs@entry=0xd09ce800,
> errp=errp@entry=0xfef7a3b0) at block/io.c:97
> #7 0xce2eb9f0 in bdrv_open_common (bs=bs@entry=0xd09ce800,
> file=file@entry=0xd0e89800, options=,
> errp=errp@entry=0xfef7a450)
> at block.c:1194
> #8 0xce2eedec in bdrv_open_inherit (filename=,
> filename@entry=0xd25f92d0
> "/dev/mapper/36384c4f100630193359db7a8011d",
> reference=reference@entry=0x0, options=,
> options@entry=0xd3d0f4b0, flags=, flags@entry=128,
> parent=parent@entry=0x0,
> child_role=child_role@entry=0x0, errp=errp@entry=0xfef7a710) at
> block.c:1895
> #9 0xce2ef510 in bdrv_open (filename=filename@entry=0xd25f92d0
> "/dev/mapper/36384c4f100630193359db7a8011d",
> reference=reference@entry=0x0,
> options=options@entry=0xd3d0f4b0, flags=flags@entry=128,
> errp=errp@entry=0xfef7a710) at block.c:1979
> #10 0xce331ef0 in blk_new_open
> (filename=filename@entry=0xd25f92d0
> "/dev/mapper/36384c4f100630193359db7a8011d",
> reference=reference@entry=0x0,
> options=options@entry=0xd3d0f4b0, flags=128,
> errp=errp@entry=0xfef7a710) at block/block_backend.c:213
> #11 0xce0da1f4 in blockdev_init (file=file@entry=0xd25f92d0
> "/dev/mapper/36384c4f100630193359db7a8011d",
> bs_opts=bs_opts@entry=0xd3d0f4b0,
> errp=errp@entry=0xfef7a710) at blockdev.c:603
> #12 0xce0dc478 in drive_new (all_opts=all_opts@entry=0xd4dc31d0,
> block_default_type=) at blockdev.c:1116
> #13 0xce0e3ee0 in add_init_drive (
> optstr=optstr@entry=0xd0872ec0 
> "file=/dev/mapper/36384c4f100630193359db7a8011d,format=raw,if=none,id=drive-scsi0-0-0-3,cache=none,aio=native")
> at device_hotplug.c:46
> #14 0xce0e3f78 in hmp_drive_add (mon=0xfef7a810,
> qdict=0xd0c8f000) at device_hotplug.c:67
> #15 0xcdf7d688 in handle_hmp_command (mon=0xfef7a810,
> cmdline=) at /usr/src/debug/qemu-kvm-2.8.1/monitor.c:3199
> #16 0xcdf7d778 in qmp_human_monitor_command (
> command_line=0xcfc8e3c0 "drive_add dummy 
> file=/dev/mapper/36384c4f100630193359db7a8011d,format=raw,if=none,id=drive-scsi0-0-0-3,cache=none,aio=native",
> 
> has_cpu_index=false, cpu_index=0, errp=errp@entry=0xfef7a968) at
> /usr/src/debug/qemu-kvm-2.8.1/monitor.c:660
> #17 0xce0fdb30 in qmp_marshal_human_monitor_command (args=<optimized out>, ret=0xfef7a9e0, errp=0xfef7a9d8) at qmp-marshal.c:2223
> #18 0xce3b6ad0 in do_qmp_dispatch (request=,
> errp=0xfef7aa20, errp@entry=0xfef7aa40) at qapi/qmp_dispatch.c:115
> #19 0xce3b6d58 in qmp_dispatch (request=) at
> qapi/qmp_dispatch.c:142
> #20 0xcdf79398 in handle_qmp_command (parser=,
> tokens=) at /usr/src/debug/qemu-kvm-2.8.1/monitor.c:4010
> #21 0xce3bd6c0 in json_message_process_token (lexer=0xcf834c80,
> input=, type=JSON_RCURLY, x=214, y=274) at
> qobject/json_streamer.c:105
> #22 0xce3f3d4c in json_lexer_feed_char
> (lexer=lexer@entry=0xcf834c80, ch=,
> flush=flush@entry=false) at qobject/json_lexer.c:319
> #23 0xce3f3e6c in json_lexer_feed (lexer=0xcf834c80,
> buffer=, size=) at qobject/json_lexer.c:369
> #24 0xcdf77c64 in monitor_qmp_read (opaque=,
> buf=, size=) at
> /usr/src/debug/qemu-kvm-2.8.1/monitor.c:4040
> #25 0xce0eab18 in tcp_chr_read (chan=,
> cond=, opaque=0xcf90b280) at qemu_char.c:3260
> #26 0x9dadf200 in g_main_context_dispatch () from
> /lib64/libglib-2.0.so.0
> #27 0xce3c4a00 in glib_pollfds_poll () at util/main_loop.c:230
> #28 0xce3c4a88 in os_host_main_loop_wait (timeout=)
> at util/main_loop.c:278
> #29 0xce3c4bf0 in main_loop_wait (nonblocking=) at
> util/main_loop.c:534
> #30 0xce0f5d08 in main_loop () at vl.c:2120
> 

[Qemu-devel] [PATCH v5 0/3] virtio-scsi: restart DMA after iothread

2019-06-20 Thread Stefan Hajnoczi
v5:
 * Plumbing vm change state handlers into DeviceClass/BusClass is a rather
   large bug fix.  Instead I've combined the previous priorities approach with
   the observation from Kevin and Paolo that we really want to order by qdev
   tree depth.

   The new qdev_add_vm_change_state_handler() API lets DeviceStates register
   callbacks that execute in qdev tree depth order.  This solves the
   virtio-scsi bug since the virtio-scsi device's callback must complete before
   its child scsi-disk's callback runs.

   Is this a good compromise for everyone?

Stefan Hajnoczi (3):
  vl: add qemu_add_vm_change_state_handler_prio()
  qdev: add qdev_add_vm_change_state_handler()
  virtio-scsi: restart DMA after iothread

 hw/core/Makefile.objs |  1 +
 include/hw/qdev-core.h|  5 +++
 include/sysemu/sysemu.h   |  2 +
 hw/core/vm-change-state-handler.c | 61 +++
 hw/scsi/scsi-bus.c|  4 +-
 hw/virtio/virtio.c|  4 +-
 vl.c  | 59 --
 7 files changed, 120 insertions(+), 16 deletions(-)
 create mode 100644 hw/core/vm-change-state-handler.c

-- 
2.21.0
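
As a rough illustration of the idea, the new helper can be thought of as
deriving a priority from the device's depth in the qdev tree and delegating
to qemu_add_vm_change_state_handler_prio() from patch 1/3.  The sketch below
is an assumption about the shape of hw/core/vm-change-state-handler.c, not a
copy of it.

/*
 * Sketch, not the actual hw/core/vm-change-state-handler.c: walk
 * device -> parent bus -> parent device to get the qdev tree depth, then
 * reuse the priority-ordered registration from patch 1/3.
 */
#include "qemu/osdep.h"
#include "hw/qdev-core.h"
#include "sysemu/sysemu.h"

static int qdev_tree_depth(DeviceState *dev)
{
    int depth = 0;

    while (dev && dev->parent_bus) {
        dev = dev->parent_bus->parent;
        depth++;
    }
    return depth;
}

VMChangeStateEntry *qdev_add_vm_change_state_handler(DeviceState *dev,
                                                     VMChangeStateHandler *cb,
                                                     void *opaque)
{
    /* Shallower devices get lower priorities: they run first on resume and
     * last on stop, which is the ordering the virtio-scsi fix relies on. */
    return qemu_add_vm_change_state_handler_prio(cb, opaque,
                                                 qdev_tree_depth(dev));
}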




[Qemu-devel] [PATCH v5 1/3] vl: add qemu_add_vm_change_state_handler_prio()

2019-06-20 Thread Stefan Hajnoczi
Add an API for registering vm change state handlers with a well-defined
ordering.  This is necessary when handlers depend on each other.

Small coding style fixes are included to make checkpatch.pl happy.

Signed-off-by: Stefan Hajnoczi 
---
 include/sysemu/sysemu.h |  2 ++
 vl.c| 59 -
 2 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 61579ae71e..984c439ac9 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -29,6 +29,8 @@ typedef void VMChangeStateHandler(void *opaque, int running, 
RunState state);
 
 VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
  void *opaque);
+VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
+VMChangeStateHandler *cb, void *opaque, int priority);
 void qemu_del_vm_change_state_handler(VMChangeStateEntry *e);
 void vm_state_notify(int running, RunState state);
 
diff --git a/vl.c b/vl.c
index 99a56b5556..7fac2ae7ca 100644
--- a/vl.c
+++ b/vl.c
@@ -1471,28 +1471,57 @@ static int machine_help_func(QemuOpts *opts, 
MachineState *machine)
 struct vm_change_state_entry {
 VMChangeStateHandler *cb;
 void *opaque;
-QLIST_ENTRY (vm_change_state_entry) entries;
+QTAILQ_ENTRY(vm_change_state_entry) entries;
+int priority;
 };
 
-static QLIST_HEAD(, vm_change_state_entry) vm_change_state_head;
+static QTAILQ_HEAD(, vm_change_state_entry) vm_change_state_head;
 
-VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
- void *opaque)
+/**
+ * qemu_add_vm_change_state_handler_prio:
+ * @cb: the callback to invoke
+ * @opaque: user data passed to the callback
+ * @priority: low priorities execute first when the vm runs and the reverse is
+ *true when the vm stops
+ *
+ * Register a callback function that is invoked when the vm starts or stops
+ * running.
+ *
+ * Returns: an entry to be freed using qemu_del_vm_change_state_handler()
+ */
+VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
+VMChangeStateHandler *cb, void *opaque, int priority)
 {
 VMChangeStateEntry *e;
+VMChangeStateEntry *other;
 
-e = g_malloc0(sizeof (*e));
-
+e = g_malloc0(sizeof(*e));
 e->cb = cb;
 e->opaque = opaque;
-QLIST_INSERT_HEAD(&vm_change_state_head, e, entries);
+e->priority = priority;
+
+/* Keep list sorted in ascending priority order */
+QTAILQ_FOREACH(other, &vm_change_state_head, entries) {
+if (priority < other->priority) {
+QTAILQ_INSERT_BEFORE(other, e, entries);
+return e;
+}
+}
+
+QTAILQ_INSERT_TAIL(&vm_change_state_head, e, entries);
 return e;
 }
 
+VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
+ void *opaque)
+{
+return qemu_add_vm_change_state_handler_prio(cb, opaque, 0);
+}
+
 void qemu_del_vm_change_state_handler(VMChangeStateEntry *e)
 {
-QLIST_REMOVE (e, entries);
-g_free (e);
+QTAILQ_REMOVE(&vm_change_state_head, e, entries);
+g_free(e);
 }
 
 void vm_state_notify(int running, RunState state)
@@ -1501,8 +1530,14 @@ void vm_state_notify(int running, RunState state)
 
 trace_vm_state_notify(running, state, RunState_str(state));
 
-QLIST_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
-e->cb(e->opaque, running, state);
+if (running) {
+QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
+e->cb(e->opaque, running, state);
+}
+} else {
+QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) {
+e->cb(e->opaque, running, state);
+}
 }
 }
 
@@ -3025,7 +3060,7 @@ int main(int argc, char **argv, char **envp)
 exit(1);
 }
 
-QLIST_INIT (&vm_change_state_head);
+QTAILQ_INIT(&vm_change_state_head);
 os_setup_early_signal_handling();
 
 cpu_option = NULL;
-- 
2.21.0
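
A short usage sketch of the new call (the callbacks and opaque pointers below
are hypothetical; only qemu_add_vm_change_state_handler_prio() comes from this
patch): a controller registers at a lower priority than the device that
depends on it, so the controller's callback runs first when the VM resumes
and last when it stops.

#include "qemu/osdep.h"
#include "sysemu/sysemu.h"

static void controller_vm_state_change(void *opaque, int running, RunState state)
{
    /* e.g. re-initialise an iothread before any dependant restarts DMA */
}

static void disk_vm_state_change(void *opaque, int running, RunState state)
{
    /* e.g. restart pending requests, relying on the controller being ready */
}

void register_example_handlers(void *controller, void *disk)
{
    /* Priority 0 runs before priority 1 on resume, and after it on stop. */
    qemu_add_vm_change_state_handler_prio(controller_vm_state_change,
                                          controller, 0);
    qemu_add_vm_change_state_handler_prio(disk_vm_state_change, disk, 1);
}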




[Qemu-devel] [PATCH v5 3/3] virtio-scsi: restart DMA after iothread

2019-06-20 Thread Stefan Hajnoczi
When the 'cont' command resumes guest execution the vm change state
handlers are invoked.  Unfortunately there is no explicit ordering
between classic qemu_add_vm_change_state_handler() callbacks.  When two
layers of code both use vm change state handlers, we don't control which
handler runs first.

virtio-scsi with iothreads hits a deadlock when a failed SCSI command is
restarted and completes before the iothread is re-initialized.

This patch uses the new qdev_add_vm_change_state_handler() API to
guarantee that virtio-scsi's virtio change state handler executes before
the SCSI bus children.  This way DMA is restarted after the iothread has
re-initialized.

Signed-off-by: Stefan Hajnoczi 
---
 hw/scsi/scsi-bus.c | 4 ++--
 hw/virtio/virtio.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 196136a307..fdc3a0e4e0 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -207,8 +207,8 @@ static void scsi_qdev_realize(DeviceState *qdev, Error 
**errp)
 error_propagate(errp, local_err);
 return;
 }
-dev->vmsentry = qemu_add_vm_change_state_handler(scsi_dma_restart_cb,
- dev);
+dev->vmsentry = qdev_add_vm_change_state_handler(DEVICE(dev),
+scsi_dma_restart_cb, dev);
 }
 
 static void scsi_qdev_unrealize(DeviceState *qdev, Error **errp)
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index e1e90fcfd6..e42e6710d2 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2354,8 +2354,8 @@ void virtio_init(VirtIODevice *vdev, const char *name,
 } else {
 vdev->config = NULL;
 }
-vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change,
- vdev);
+vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev),
+virtio_vmstate_change, vdev);
 vdev->device_endian = virtio_default_endian();
 vdev->use_guest_notifier_mask = true;
 }
-- 
2.21.0



