Re: [PATCH v7 03/48] nvme: move device parameters to separate struct

2020-04-14 Thread Philippe Mathieu-Daudé

On 4/15/20 7:50 AM, Klaus Jensen wrote:

From: Klaus Jensen 

Move the device configuration parameters to a separate struct to make it
explicit what is configurable and what is set internally.
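
For reference, the separate struct added by the nvme.h hunk (not quoted in
this excerpt) looks roughly like the sketch below; treat the exact field
list as an assumption derived from the device's existing properties:

    typedef struct NvmeParams {
        char     *serial;       /* user-supplied serial number (required) */
        uint32_t num_queues;    /* number of submission/completion queues */
        uint32_t cmb_size_mb;   /* Controller Memory Buffer size, in MiB */
    } NvmeParams;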

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
  hw/block/nvme.c | 44 ++--
  hw/block/nvme.h | 16 +---
  2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index da0e8af42823..249f759f076e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -64,12 +64,12 @@ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
*buf, int size)
  
  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)

  {
-return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
+return sqid < n->params.num_queues && n->sq[sqid] != NULL ? 0 : -1;
  }
  
  static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)

  {
-return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
+return cqid < n->params.num_queues && n->cq[cqid] != NULL ? 0 : -1;
  }
  
  static void nvme_inc_cq_tail(NvmeCQueue *cq)

@@ -631,7 +631,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
  trace_nvme_dev_err_invalid_create_cq_addr(prp1);
  return NVME_INVALID_FIELD | NVME_DNR;
  }
-if (unlikely(vector > n->num_queues)) {
+if (unlikely(vector > n->params.num_queues)) {
  trace_nvme_dev_err_invalid_create_cq_vector(vector);
  return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
  }
@@ -783,7 +783,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
  break;
  case NVME_NUMBER_OF_QUEUES:
-result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 
16));
+result = cpu_to_le32((n->params.num_queues - 2) |
+ ((n->params.num_queues - 2) << 16));
  trace_nvme_dev_getfeat_numq(result);
  break;
  case NVME_TIMESTAMP:
@@ -827,9 +828,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
  case NVME_NUMBER_OF_QUEUES:
  trace_nvme_dev_setfeat_numq((dw11 & 0x) + 1,
  ((dw11 >> 16) & 0x) + 1,
-n->num_queues - 1, n->num_queues - 1);
-req->cqe.result =
-cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
+n->params.num_queues - 1,
+n->params.num_queues - 1);
+req->cqe.result = cpu_to_le32((n->params.num_queues - 2) |
+  ((n->params.num_queues - 2) << 16));
  break;
  case NVME_TIMESTAMP:
  return nvme_set_feature_timestamp(n, cmd);
@@ -900,12 +902,12 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
  
  blk_drain(n->conf.blk);
  
-for (i = 0; i < n->num_queues; i++) {

+for (i = 0; i < n->params.num_queues; i++) {
  if (n->sq[i] != NULL) {
  nvme_free_sq(n->sq[i], n);
  }
  }
-for (i = 0; i < n->num_queues; i++) {
+for (i = 0; i < n->params.num_queues; i++) {
  if (n->cq[i] != NULL) {
  nvme_free_cq(n->cq[i], n);
  }
@@ -1306,7 +1308,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
  int64_t bs_size;
  uint8_t *pci_conf;
  
-if (!n->num_queues) {

+if (!n->params.num_queues) {
  error_setg(errp, "num_queues can't be zero");
  return;
  }
@@ -1322,7 +1324,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
  return;
  }
  
-if (!n->serial) {

+if (!n->params.serial) {
  error_setg(errp, "serial property not set");
  return;
  }
@@ -1339,25 +1341,25 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
  pcie_endpoint_cap_init(pci_dev, 0x80);
  
  n->num_namespaces = 1;

-n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
+n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
  n->ns_size = bs_size / (uint64_t)n->num_namespaces;
  
  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);

-n->sq = g_new0(NvmeSQueue *, n->num_queues);
-n->cq = g_new0(NvmeCQueue *, n->num_queues);
+n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
+n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
  
  memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,

"nvme", n->reg_size);
  pci_register_bar(pci_dev, 0,
  PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
  &n->iomem);
-msix_init_exclusive_bar(pci_dev, n->num_queues, 4, NULL);
+msix_init_exclusive_bar(pci_dev, n->params.num_queues, 4, NULL);
  
  id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));


Re: [PATCH v7 02/48] nvme: remove superfluous breaks

2020-04-14 Thread Philippe Mathieu-Daudé

On 4/15/20 7:50 AM, Klaus Jensen wrote:

From: Klaus Jensen 

These break statements were left over when commit 3036a626e9ef ("nvme:
add Get/Set Feature Timestamp support") was merged.

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
  hw/block/nvme.c | 4 
  1 file changed, 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 01e18fb9eb1f..da0e8af42823 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -788,7 +788,6 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
  break;
  case NVME_TIMESTAMP:
  return nvme_get_feature_timestamp(n, cmd);
-break;
  default:
  trace_nvme_dev_err_invalid_getfeat(dw10);
  return NVME_INVALID_FIELD | NVME_DNR;
@@ -832,11 +831,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
  req->cqe.result =
  cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
  break;
-
  case NVME_TIMESTAMP:
  return nvme_set_feature_timestamp(n, cmd);
-break;
-
  default:
  trace_nvme_dev_err_invalid_setfeat(dw10);
  return NVME_INVALID_FIELD | NVME_DNR;



Reviewed-by: Philippe Mathieu-Daudé 




Re: [EXTERNAL] [PATCH] target/ppc: Fix mtmsr(d) L=1 variant that loses interrupts

2020-04-14 Thread Cédric Le Goater
On 4/14/20 1:11 PM, Nicholas Piggin wrote:
> If mtmsr L=1 sets MSR[EE] while there is a maskable exception pending,
> it does not cause an interrupt. This causes the test case to hang:
> 
> https://lists.gnu.org/archive/html/qemu-ppc/2019-10/msg00826.html
> 
> More recently, Linux reduced the occurrence of operations (e.g., rfi)
> which stop translation and allow pending interrupts to be processed.
> This started causing hangs in Linux boot in long-running kernel tests,
> running with '-d int' shows the decrementer stops firing despite DEC
> wrapping and MSR[EE]=1.
> 
> https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/208301.html
> 
> The cause is the broken mtmsr L=1 behaviour, which is contrary to the
> architecture. From Power ISA v3.0B, p.977, Move To Machine State Register,
> Programming Note states:
> 
> If MSR[EE]=0 and an External, Decrementer, or Performance Monitor
> exception is pending, executing an mtmsrd instruction that sets
> MSR[EE] to 1 will cause the interrupt to occur before the next
> instruction is executed, if no higher priority exception exists
> 
> Fix this by handling L=1 exactly the same way as L=0, modulo the MSR
> bits altered.
> 
> The confusion arises from L=0 being "context synchronizing" whereas L=1
> is "execution synchronizing", which is a weaker semantic. However this
> is not a relaxation of the requirement that these exceptions cause
> interrupts when MSR[EE]=1 (e.g., when mtmsr executes to completion as
> TCG is doing here), rather it specifies how a pipelined processor can
> have multiple instructions in flight where one may influence how another
> behaves.

I was expecting more changes but this looks fine. 

Reviewed-by: Cédric Le Goater 

> Cc: qemu-sta...@nongnu.org
> Reported-by: Anton Blanchard 
> Reported-by: Nathan Chancellor 
> Tested-by: Nathan Chancellor 
> Signed-off-by: Nicholas Piggin 

I gave it a try on PowerNV, pseries and mac99. All good.

Tested-by: Cédric Le Goater 

I don't know how we could include tests in QEMU such as the one Anton 
sent. These are good exercisers for our exception model.

Thanks,

C. 

> ---
> Thanks very much to Nathan for reporting and testing it, I added his
> Tested-by tag despite this being a more polished patch, as the basics are
> still the same (and still fixes his test case here).
> 
> This bug possibly goes back to early v2.04 / mtmsrd L=1 support around
> 2007, and the code has been changed several times since then, so it may
> require some backporting.
> 
> 32-bit / mtmsr untested at the moment, I don't have an environment
> handy.
>
> 
>  target/ppc/translate.c | 46 +-
>  1 file changed, 27 insertions(+), 19 deletions(-)
> 
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index b207fb5386..9959259dba 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -4361,30 +4361,34 @@ static void gen_mtmsrd(DisasContext *ctx)
>  CHK_SV;
>  
>  #if !defined(CONFIG_USER_ONLY)
> +if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
> +gen_io_start();
> +}
>  if (ctx->opcode & 0x0001) {
> -/* Special form that does not need any synchronisation */
> +/* L=1 form only updates EE and RI */
>  TCGv t0 = tcg_temp_new();
> +TCGv t1 = tcg_temp_new();
>  tcg_gen_andi_tl(t0, cpu_gpr[rS(ctx->opcode)],
>  (1 << MSR_RI) | (1 << MSR_EE));
> -tcg_gen_andi_tl(cpu_msr, cpu_msr,
> +tcg_gen_andi_tl(t1, cpu_msr,
>  ~(target_ulong)((1 << MSR_RI) | (1 << MSR_EE)));
> -tcg_gen_or_tl(cpu_msr, cpu_msr, t0);
> +tcg_gen_or_tl(t1, t1, t0);
> +
> +gen_helper_store_msr(cpu_env, t1);
>  tcg_temp_free(t0);
> +tcg_temp_free(t1);
> +
>  } else {
>  /*
>   * XXX: we need to update nip before the store if we enter
>   *  power saving mode, we will exit the loop directly from
>   *  ppc_store_msr
>   */
> -if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
> -gen_io_start();
> -}
>  gen_update_nip(ctx, ctx->base.pc_next);
>  gen_helper_store_msr(cpu_env, cpu_gpr[rS(ctx->opcode)]);
> -/* Must stop the translation as machine state (may have) changed */
> -/* Note that mtmsr is not always defined as context-synchronizing */
> -gen_stop_exception(ctx);
>  }
> +/* Must stop the translation as machine state (may have) changed */
> +gen_stop_exception(ctx);
>  #endif /* !defined(CONFIG_U

Re: [PATCH v41 01/21] target/avr: Add outward facing interfaces and core CPU logic

2020-04-14 Thread Philippe Mathieu-Daudé
Hi Michael,

On 4/12/20 11:14 AM, Michael Rolnik wrote:
> hi all.
> 
> are there any news / updates about AVR support?

Aleksandar has not been very active on qemu-devel recently; I suppose he is busy.

I have ~15 patches on top of your series, but I'm waiting for an update on
the status of this target before posting them.

> 
> On Mon, Mar 23, 2020 at 10:14 PM Michael Rolnik  > wrote:
> 
> thanks Philippe.
> 
> On Mon, Mar 23, 2020 at 9:20 PM Philippe Mathieu-Daudé
> mailto:phi...@redhat.com>> wrote:
> 
> On 3/23/20 7:03 PM, Richard Henderson wrote:
> > On 3/23/20 10:03 AM, Michael Rolnik wrote:
> >> Hi Philippe.
> >>
> >> It's been a while. let me think about it and get back to you.
> what is your
> >> concern ?
> 
> We are using this series with Joaquin for a Google Summer of Code
> project, so we are noticing some bugs and fixing them.
> As it has not been merged, we work in a fork.
> Since it was posted on the list, I prefer to ask on the list rather than
> directly to you.
> 
> >
> > It shouldn't be there.  See commit 1f5c00cfdb81.
> 
> Ah it has been moved to cpu_common_reset, thanks :)
> I suppose it is because this port is based on some quite old work.
> 
> >
> >>      > +    memset(env->r, 0, sizeof(env->r));
> >>      > +
> >>      > +    tlb_flush(cs);
> >>
> >>      Why are you calling tlb_flush() here?
> >
> >
> > r~
> >
> 
> 
> 
> -- 
> Best Regards,
> Michael Rolnik
> 
> 
> 
> -- 
> Best Regards,
> Michael Rolnik



Re: [Qemu devel PATCH v5 1/3] hw/net: Add Smartfusion2 emac block

2020-04-14 Thread Philippe Mathieu-Daudé
On 4/14/20 5:02 PM, sundeep.l...@gmail.com wrote:
> From: Subbaraya Sundeep 
> 
> Modelled the Ethernet MAC of the Smartfusion2 SoC.
> A Micrel KSZ8051 PHY is present on Emcraft's
> SOM kit, hence the same PHY is emulated.
> 
> Signed-off-by: Subbaraya Sundeep 
> Reviewed-by: Philippe Mathieu-Daudé 
> Tested-by: Philippe Mathieu-Daudé 
> ---
>  MAINTAINERS|   2 +
>  hw/net/Makefile.objs   |   1 +
>  hw/net/msf2-emac.c | 566 
> +
>  include/hw/net/msf2-emac.h |  53 +
>  4 files changed, 622 insertions(+)
>  create mode 100644 hw/net/msf2-emac.c
>  create mode 100644 include/hw/net/msf2-emac.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 642c8e0..9d0ff20 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -919,6 +919,8 @@ F: include/hw/arm/msf2-soc.h
>  F: include/hw/misc/msf2-sysreg.h
>  F: include/hw/timer/mss-timer.h
>  F: include/hw/ssi/mss-spi.h
> +F: hw/net/msf2-emac.c
> +F: include/hw/net/msf2-emac.h
>  
>  Emcraft M2S-FG484
>  M: Subbaraya Sundeep 
> diff --git a/hw/net/Makefile.objs b/hw/net/Makefile.objs
> index af4d194..f2b7398 100644
> --- a/hw/net/Makefile.objs
> +++ b/hw/net/Makefile.objs
> @@ -55,3 +55,4 @@ common-obj-$(CONFIG_ROCKER) += rocker/rocker.o 
> rocker/rocker_fp.o \
>  obj-$(call lnot,$(CONFIG_ROCKER)) += rocker/qmp-norocker.o
>  
>  common-obj-$(CONFIG_CAN_BUS) += can/
> +common-obj-$(CONFIG_MSF2) += msf2-emac.o
> diff --git a/hw/net/msf2-emac.c b/hw/net/msf2-emac.c
> new file mode 100644
> index 000..cae40fd
> --- /dev/null
> +++ b/hw/net/msf2-emac.c
> @@ -0,0 +1,566 @@
> +/*
> + * QEMU model of the Smartfusion2 Ethernet MAC.
> + *
> + * Copyright (c) 2020 Subbaraya Sundeep .
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> + * of this software and associated documentation files (the "Software"), to 
> deal
> + * in the Software without restriction, including without limitation the 
> rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + *
> + * Refer to section Ethernet MAC in the document:
> + * UG0331: SmartFusion2 Microcontroller Subsystem User Guide
> + * Datasheet URL:
> + * 
> https://www.microsemi.com/document-portal/cat_view/56661-internal-documents/
> + * 56758-soc?lang=en&limit=20&limitstart=220
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "qemu/log.h"
> +#include "qapi/error.h"
> +#include "exec/address-spaces.h"
> +#include "hw/registerfields.h"
> +#include "hw/net/msf2-emac.h"
> +#include "hw/net/mii.h"
> +#include "hw/irq.h"
> +#include "hw/qdev-properties.h"
> +#include "migration/vmstate.h"
> +
> +REG32(CFG1, 0x0)
> +FIELD(CFG1, RESET, 31, 1)
> +FIELD(CFG1, RX_EN, 2, 1)
> +FIELD(CFG1, TX_EN, 0, 1)
> +FIELD(CFG1, LB_EN, 8, 1)
> +REG32(CFG2, 0x4)
> +REG32(IFG, 0x8)
> +REG32(HALF_DUPLEX, 0xc)
> +REG32(MAX_FRAME_LENGTH, 0x10)
> +REG32(MII_CMD, 0x24)
> +FIELD(MII_CMD, READ, 0, 1)
> +REG32(MII_ADDR, 0x28)
> +FIELD(MII_ADDR, REGADDR, 0, 5)
> +FIELD(MII_ADDR, PHYADDR, 8, 5)
> +REG32(MII_CTL, 0x2c)
> +REG32(MII_STS, 0x30)
> +REG32(STA1, 0x40)
> +REG32(STA2, 0x44)
> +REG32(FIFO_CFG0, 0x48)
> +REG32(DMA_TX_CTL, 0x180)
> +FIELD(DMA_TX_CTL, EN, 0, 1)
> +REG32(DMA_TX_DESC, 0x184)
> +REG32(DMA_TX_STATUS, 0x188)
> +FIELD(DMA_TX_STATUS, PKTCNT, 16, 8)
> +FIELD(DMA_TX_STATUS, UNDERRUN, 1, 1)
> +FIELD(DMA_TX_STATUS, PKT_SENT, 0, 1)
> +REG32(DMA_RX_CTL, 0x18c)
> +FIELD(DMA_RX_CTL, EN, 0, 1)
> +REG32(DMA_RX_DESC, 0x190)
> +REG32(DMA_RX_STATUS, 0x194)
> +FIELD(DMA_RX_STATUS, PKTCNT, 16, 8)
> +FIELD(DMA_RX_STATUS, OVERFLOW, 2, 1)
> +FIELD(DMA_RX_STATUS, PKT_RCVD, 0, 1)
> +REG32(DMA_IRQ_MASK, 0x198)
> +REG32(DMA_IRQ, 0x19c)
> +
> +#define EMPTY_MASK  (1 << 31)
> +#define PKT_SIZE0x7FF
> +#define PHYADDR 0x1
> +#define MAX_PKT_SIZE2048
> +
> +typedef struct {
> +uint32_t pktaddr;
> +uint32_t pktsize;
> +uint32_t next;
> +} EmacDesc;
> +
> +static uint32_t emac_get_isr(MSF2EmacState *s)
> +{
> +uint32_t ier = s->regs[R_DMA_IRQ_MASK];
> +uint32_t tx = s->regs[R_DMA_TX_STATUS] & 0xF;
> +uin

Re: [PATCH v5 for-5.0] configure: warn if not using a separate build directory

2020-04-14 Thread Markus Armbruster
Peter Maydell  writes:

> On Mon, 6 Apr 2020 at 16:33, Daniel P. Berrangé  wrote:
>>
>> Running configure directly from the source directory is a build
>> configuration that will go away in future. It is also not currently
>> covered by any automated testing. Display a deprecation warning if
>> the user attempts to use an in-srcdir build setup, so that they are
>> aware that they're building QEMU in an undesirable manner.

The warning text has evolved since v5, but the commit message hasn't
quite kept up, I think.

>>
>> Reviewed-by: Aleksandar Markovic 
>> Tested-by: Philippe Mathieu-Daudé 
>> Signed-off-by: Daniel P. Berrangé 
>> ---
>
> Given where we are in the release cycle, I think this isn't
> going to go in for 5.0; and it's not really that urgent now
> we've decided we don't want to actually deprecate in-tree builds.

Have we?

We had Aleksandar assert that out-of-tree builds can't do certain
things, which led to the decision to soften this patch's warning from
"deprecated; better use the grace period to adjust, and here's how to"
to "not recommended; here's the recommended way".  Since we know in-tree
builds are more fragile, we owe our users such a warning.  We should've
added it long ago.

We also had a few people telling us that in-tree builds are so much more
convenient for them that us doing extra work to keep them working for
them is totally worth it for them.  SCNR.

Whether we want to keep sinking time & energy into an extra way to build
will become irrelevant once we move to Meson, unless Meson deviates from
its "this is an opinionated build tool, not a 'give users all the rope
they may possibly want, and then some'" approach in a surprising lapse
of judgement.

> I've removed the text I put into the changelog about this earlier.

Pity.

If we can't reach consensus in time for 5.0, that's regrettable, but I
accept it.  Our decision making process is open and slow.  Hard to get
one without the other.

Much harder to accept is us once again defaulting to do nothing because
deciding what to do involves a tradeoff.




[PATCH v7 46/48] pci: allocate pci id for nvme

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

The emulated nvme device (hw/block/nvme.c) is currently using an
internal Intel device id.

Prepare to change that by allocating a device id under the 1b36 (Red
Hat, Inc.) vendor id.

Signed-off-by: Klaus Jensen 
Cc: Gerd Hoffmann 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 MAINTAINERS|  1 +
 docs/specs/nvme.txt| 25 +
 docs/specs/pci-ids.txt |  1 +
 include/hw/pci/pci.h   |  1 +
 4 files changed, 28 insertions(+)
 create mode 100644 docs/specs/nvme.txt

diff --git a/MAINTAINERS b/MAINTAINERS
index 5f93e8c01d34..b4bbc58b668b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1736,6 +1736,7 @@ L: qemu-bl...@nongnu.org
 S: Supported
 F: hw/block/nvme*
 F: tests/qtest/nvme-test.c
+F: docs/specs/nvme.txt
 
 megasas
 M: Hannes Reinecke 
diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
new file mode 100644
index ..b51552cb5c3f
--- /dev/null
+++ b/docs/specs/nvme.txt
@@ -0,0 +1,25 @@
+NVM Express Controller
+==
+
+The nvme device (-device nvme) emulates an NVM Express Controller.
+
+
+Reference Specifications
+
+
+The device currently implements most mandatory features of NVMe v1.3d, see
+
+  https://nvmexpress.org/resources/specifications/
+
+for the specification.
+
+
+Known issues
+
+
+* The device does not have any way of storing persistent state, so minor parts
+  of the implementation are in violation of the specification:
+- The accounting numbers in the SMART/Health are reset across power cycles
+
+* Interrupt Coalescing is not supported and is disabled by default in violation
+  of the specification.
diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt
index 4d53e5c7d9d5..abbdbca6be38 100644
--- a/docs/specs/pci-ids.txt
+++ b/docs/specs/pci-ids.txt
@@ -63,6 +63,7 @@ PCI devices (other than virtio):
 1b36:000b  PCIe Expander Bridge (-device pxb-pcie)
 1b36:000d  PCI xhci usb host adapter
 1b36:000f  mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c
+1b36:0010  PCIe NVMe device (-device nvme)
 
 All these devices are documented in docs/specs.
 
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index da9057b8db97..92231885bc23 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -104,6 +104,7 @@ extern bool pci_available;
 #define PCI_DEVICE_ID_REDHAT_XHCI0x000d
 #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e
 #define PCI_DEVICE_ID_REDHAT_MDPY0x000f
+#define PCI_DEVICE_ID_REDHAT_NVME0x0010
 #define PCI_DEVICE_ID_REDHAT_QXL 0x0100
 
 #define FMT_PCIBUS  PRIx64
-- 
2.26.0




[PATCH v7 47/48] nvme: change controller pci id

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

There are two reasons for changing this:

  1. The nvme device currently uses an internal Intel device id.

  2. Since commits "nvme: fix write zeroes offset and count" and "nvme:
 support multiple namespaces" the controller device no longer has
 the quirks that the Linux kernel thinks it has.

 As the quirks are applied based on pci vendor and device id, change
 them to get rid of the quirks.

To keep backward compatibility, add a new 'x-use-intel-id' parameter to
the nvme device to force use of the Intel vendor and device id. This is
off by default, but a compat property is added to set it for machine types
4.2 and older.
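
For illustration only (not part of the patch), forcing the legacy Intel id
on a current machine type would look roughly like this, with 'disk0' being
a placeholder -drive id:

  -device nvme,serial=deadbeef,drive=disk0,x-use-intel-id=on

Machine types 4.2 and older get the same behaviour automatically via the
hw_compat_4_2 entry added below.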

Signed-off-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 13 +
 hw/block/nvme.h   |  4 +++-
 hw/core/machine.c |  1 +
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e338d0893a70..40a400333828 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2544,8 +2544,15 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev)
 
 pci_conf[PCI_INTERRUPT_PIN] = 1;
 pci_config_set_prog_interface(pci_conf, 0x2);
-pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
-pci_config_set_device_id(pci_conf, 0x5845);
+
+if (n->params.use_intel_id) {
+pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
+pci_config_set_device_id(pci_conf, 0x5846);
+} else {
+pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
+pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
+}
+
 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
 pcie_endpoint_cap_init(pci_dev, 0x80);
 
@@ -2727,8 +2734,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
 pc->realize = nvme_realize;
 pc->exit = nvme_exit;
 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
-pc->vendor_id = PCI_VENDOR_ID_INTEL;
-pc->device_id = 0x5845;
 pc->revision = 2;
 
 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index f42c17651b7b..615a6ff5d13d 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -13,7 +13,8 @@
 DEFINE_PROP_UINT32("max_ioqpairs", _state, _props.max_ioqpairs, 64), \
 DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3), \
 DEFINE_PROP_UINT32("aer_max_queued", _state, _props.aer_max_queued, 64), \
-DEFINE_PROP_UINT8("mdts", _state, _props.mdts, 7)
+DEFINE_PROP_UINT8("mdts", _state, _props.mdts, 7), \
+DEFINE_PROP_BOOL("x-use-intel-id", _state, _props.use_intel_id, false)
 
 typedef struct NvmeParams {
 char *serial;
@@ -23,6 +24,7 @@ typedef struct NvmeParams {
 uint8_t  aerl;
 uint32_t aer_max_queued;
 uint8_t  mdts;
+bool use_intel_id;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
diff --git a/hw/core/machine.c b/hw/core/machine.c
index c1a444cb7558..de972a7e45dc 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -40,6 +40,7 @@ GlobalProperty hw_compat_4_2[] = {
 { "qxl", "revision", "4" },
 { "qxl-vga", "revision", "4" },
 { "fw_cfg", "acpi-mr-restore", "false" },
+{ "nvme", "x-use-intel-id", "on"},
 };
 const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2);
 
-- 
2.26.0




[PATCH v7 41/48] nvme: harden cmb access

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Since the controller has only supported PRPs so far it has not been
required to check the ending address (addr + len - 1) of the CMB access
for validity since it has been guaranteed to be in range of the CMB.

This changes when the controller adds support for SGLs (next patch), so
add that check.
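
As a rough illustration (a sketch, not part of the patch), the added guard
boils down to rejecting any access whose end address wraps around before
the usual CMB bounds test is applied:

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch: assuming size > 0, reject an access [addr, addr + size) whose
     * last byte wraps past UINT64_MAX. */
    static bool addr_range_wraps(uint64_t addr, uint64_t size)
    {
        uint64_t hi = addr + size - 1;  /* address of the last byte */
        return hi < addr;               /* wrapped -> reject the access */
    }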

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6dcd9c4b4cd0..5140bc32913d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
+hwaddr hi = addr + size - 1;
+if (hi < addr) {
+return 1;
+}
+
+if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
 return 0;
 }
-- 
2.26.0




[PATCH v7 36/48] nvme: allow multiple aios per command

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

This refactors how the device issues asynchronous block backend
requests. The NvmeRequest now holds a queue of NvmeAIOs that are
associated with the command. This allows multiple aios to be issued for
a command. Only when all requests have been completed will the device
post a completion queue entry.

Because the device is currently guaranteed to only issue a single aio
request per command, the benefit is not immediately obvious. But this
functionality is required to support metadata, the dataset management
command and other features.
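
Roughly, the per-AIO completion path described above can be sketched as
follows; this is an illustration under the assumption that the request
keeps its AIOs on req->aio_tailq, with error handling and accounting
elided (the full callback in the patch is not quoted in this excerpt):

    static void nvme_aio_cb(void *opaque, int ret)
    {
        NvmeAIO *aio = opaque;
        NvmeRequest *req = aio->req;
        NvmeSQueue *sq = req->sq;
        NvmeCQueue *cq = sq->ctrl->cq[sq->cqid];

        /* ... error handling and block accounting elided ... */

        QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry);

        /* Post the completion queue entry only once every AIO registered
         * for this command has finished. */
        if (QTAILQ_EMPTY(&req->aio_tailq)) {
            nvme_enqueue_req_completion(cq, req);
        }

        nvme_aio_destroy(aio);
    }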

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 hw/block/nvme.c   | 328 +-
 hw/block/nvme.h   | 101 +++--
 hw/block/trace-events |   3 +
 3 files changed, 350 insertions(+), 82 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 846aa31eaae9..c123be10fd0d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -59,6 +59,7 @@
 } while (0)
 
 static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
 
 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
 {
@@ -163,6 +164,17 @@ static void nvme_req_clear(NvmeRequest *req)
 {
 req->ns = NULL;
 memset(&req->cqe, 0x0, sizeof(req->cqe));
+req->status = NVME_SUCCESS;
+req->slba = req->nlb = 0x0;
+req->cb = req->cb_arg = NULL;
+
+if (req->qsg.sg) {
+qemu_sglist_destroy(&req->qsg);
+}
+
+if (req->iov.iov) {
+qemu_iovec_destroy(&req->iov);
+}
 }
 
 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
@@ -388,6 +400,109 @@ static uint16_t nvme_map(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
 }
 
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+g_free(aio);
+}
+
+/*
+ * Register an asynchronous I/O operation with the NvmeRequest. The NvmeRequest
+ * will not complete until all registered AIO's have completed and the
+ * aio_tailq goes empty.
+ */
+static inline void nvme_req_add_aio(NvmeRequest *req, NvmeAIO *aio,
+NvmeAIOOp opc)
+{
+aio->opc = opc;
+
+trace_nvme_dev_req_add_aio(nvme_cid(req), aio, blk_name(aio->blk),
+   aio->offset, aio->len,
+   nvme_aio_opc_str(aio), req);
+
+if (req) {
+QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
+}
+}
+
+/*
+ * Submit an asynchronous I/O operation as described by the given NvmeAIO. This
+ * function takes care of accounting and special handling of reads and writes
+ * going to the Controller Memory Buffer.
+ */
+static void nvme_submit_aio(NvmeAIO *aio)
+{
+BlockBackend *blk = aio->blk;
+BlockAcctCookie *acct = &aio->acct;
+BlockAcctStats *stats = blk_get_stats(blk);
+
+bool is_write;
+
+switch (aio->opc) {
+case NVME_AIO_OPC_NONE:
+break;
+
+case NVME_AIO_OPC_FLUSH:
+block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+break;
+
+case NVME_AIO_OPC_WRITE_ZEROES:
+block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
+aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
+   BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
+   aio);
+break;
+
+case NVME_AIO_OPC_READ:
+case NVME_AIO_OPC_WRITE:
+is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+block_acct_start(stats, acct, aio->len,
+ is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+if (aio->qsg) {
+if (is_write) {
+aio->aiocb = dma_blk_write(blk, aio->qsg, aio->offset,
+   BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+} else {
+aio->aiocb = dma_blk_read(blk, aio->qsg, aio->offset,
+  BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+}
+} else {
+if (is_write) {
+aio->aiocb = blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
+ nvme_aio_cb, aio);
+} else {
+aio->aiocb = blk_aio_preadv(blk, aio->offset, aio->iov, 0,
+nvme_aio_cb, aio);
+}
+}
+
+break;
+}
+}
+
+static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
+{
+NvmeAIO *aio;
+size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
+
+aio = g_new0(NvmeAIO, 1);
+
+*aio = (NvmeAIO) {
+.blk = blk,
+.offset = offset,
+.len = len,
+.req = req,
+.qsg = req->qsg.sg ? &req->qsg : NULL,
+.iov = req->iov.iov ? &req->iov : NULL,
+};
+
+nvme_req_add_aio(req, aio, nvme_req_is_write(req) ?
+ 

[PATCH v7 28/48] nvme: pass request along for tracing

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 67 +--
 hw/block/trace-events |  2 +-
 2 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2ff7dd695cd7..66f92f6f6f2d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -210,14 +210,18 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList 
*qsg, QEMUIOVector *iov,
 return NVME_SUCCESS;
 }
 
-static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
- uint64_t prp2, uint32_t len, NvmeCtrl *n)
+static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+ uint64_t prp1, uint64_t prp2, uint32_t len,
+ NvmeRequest *req)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 uint16_t status;
 
+trace_nvme_dev_map_prp(nvme_cid(req), trans_len, len, prp1, prp2,
+   num_prps);
+
 if (unlikely(!prp1)) {
 trace_nvme_dev_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -308,13 +312,14 @@ unmap:
 }
 
 static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
- uint64_t prp1, uint64_t prp2, DMADirection dir)
+ uint64_t prp1, uint64_t prp2, DMADirection dir,
+ NvmeRequest *req)
 {
 QEMUSGList qsg;
 QEMUIOVector iov;
 uint16_t status = NVME_SUCCESS;
 
-status = nvme_map_prp(&qsg, &iov, prp1, prp2, len, n);
+status = nvme_map_prp(n, &qsg, &iov, prp1, prp2, len, req);
 if (status) {
 return status;
 }
@@ -559,7 +564,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 return NVME_LBA_RANGE | NVME_DNR;
 }
 
-if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
+if (nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, data_size, req)) {
 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
 return NVME_INVALID_FIELD | NVME_DNR;
 }
@@ -784,7 +789,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, 
uint8_t rae,
 }
 
 return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
-DMA_DIRECTION_FROM_DEVICE);
+DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
@@ -804,7 +809,7 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, 
uint32_t buf_len,
 trans_len = MIN(sizeof(fw_log) - off, buf_len);
 
 return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
-DMA_DIRECTION_FROM_DEVICE);
+DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
@@ -829,7 +834,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, 
uint8_t rae,
 trans_len = MIN(sizeof(errlog) - off, buf_len);
 
 return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2,
-DMA_DIRECTION_FROM_DEVICE);
+DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
@@ -970,7 +975,8 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
 return NVME_SUCCESS;
 }
 
-static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
+static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c,
+   NvmeRequest *req)
 {
 uint64_t prp1 = le64_to_cpu(c->prp1);
 uint64_t prp2 = le64_to_cpu(c->prp2);
@@ -978,10 +984,11 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, 
NvmeIdentify *c)
 trace_nvme_dev_identify_ctrl();
 
 return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
-prp2, DMA_DIRECTION_FROM_DEVICE);
+prp2, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
+static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
+ NvmeRequest *req)
 {
 NvmeNamespace *ns;
 uint32_t nsid = le32_to_cpu(c->nsid);
@@ -998,10 +1005,11 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeIdentify *c)
 ns = &n->namespaces[nsid - 1];
 
 return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
-prp2, DMA_DIRECTION_FROM_DEVICE);
+prp2, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
+static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c,
+ NvmeRequest *req)
 {
 static const int data_len = NVME_IDENTIFY_DATA_SIZE;
   

Re: [PATCH-for-5.1 v3 02/24] scripts/coccinelle: Script to simplify DeviceClass error propagation

2020-04-14 Thread Philippe Mathieu-Daudé
On 4/14/20 3:17 PM, Markus Armbruster wrote:
> Philippe Mathieu-Daudé  writes:
> 
>> On 4/14/20 2:24 PM, Markus Armbruster wrote:
>>> Philippe Mathieu-Daudé  writes:
>>>
 When a device uses an Error* with data not modified before realize(),
 this call can be moved to init(). Add a Coccinelle patch to find such
 uses.

 Signed-off-by: Philippe Mathieu-Daudé 
 ---
   ...implify-init-realize-error_propagate.cocci | 69 +++
   MAINTAINERS   |  1 +
   2 files changed, 70 insertions(+)
   create mode 100644 
 scripts/coccinelle/simplify-init-realize-error_propagate.cocci

 diff --git 
 a/scripts/coccinelle/simplify-init-realize-error_propagate.cocci 
 b/scripts/coccinelle/simplify-init-realize-error_propagate.cocci
 new file mode 100644
 index 00..2e3ec4d98a
 --- /dev/null
 +++ b/scripts/coccinelle/simplify-init-realize-error_propagate.cocci
 @@ -0,0 +1,69 @@
 +// Find error-propagation calls that don't need to be in 
 DeviceClass::realize()
 +// because they don't use information user can change before calling 
 realize(),
 +// so they can be moved to DeviceClass:initfn() where error propagation 
 is not
 +// needed.
 +//
 +// Copyright: (C) 2020 Philippe Mathieu-Daudé
 +// This work is licensed under the terms of the GNU GPLv2 or later.
 +//
 +// spatch \
 +//  --macro-file scripts/cocci-macro-file.h \
 +//  --sp-file \
 +//scripts/coccinelle/simplify-init-realize-error_propagate.cocci \
 +//  --timeout 60
 +//
 +// Inspired by 
 https://www.mail-archive.com/qemu-devel@nongnu.org/msg692500.html
 +
 +
 +@ match_class_init @
 +TypeInfo info;
 +identifier class_initfn;
 +@@
 +info.class_init = class_initfn;
 +
 +
 +@ match_instance_init @
 +TypeInfo info;
 +identifier instance_initfn;
 +@@
 +info.instance_init = instance_initfn;
 +
 +
 +@ match_realize @
 +identifier match_class_init.class_initfn;
 +DeviceClass *dc;
 +identifier realizefn;
 +@@
 +void class_initfn(...)
 +{
 +...
 +dc->realize = realizefn;
 +...
 +}
>>>
>>> I'm afraid this misses realize() methods of DeviceClass subclasses.
>>> Consider PCI device "i6300esb" (picked just because it's simple).
>>>
>>> pci_device_class_init() sets DeviceClass method realize() to
>>> pci_qdev_realize().  pci_qdev_realize() does the work common to all PCI
>>> devices, and calls PCIDeviceClass method realize() for the work specific
>>> to the PCI device at hand.
>>>
>>> i6300esb_class_init() sets PCIDeviceClass method realize() to
>>> i6300esb_realize().
>>>
>>> Your first rule should match i6300esb_info alright, and thus identify
>>> i6300esb_class_init() as a class_init() method.
>>>
>>> But your third rule can't match i6300esb_class_init()'s
>>>
>>>  k->realize = i6300esb_realize;
>>>
>>> because @k is a PCIDeviceClass, not a DeviceClass.
>>>
>>> I think it also misses cases that have a realize(), but no
>>> instance_init().
>>>
>>> Finding only some instances of an anti-pattern can still be useful.  But
>>> you should explain the script's limitations then, both in the script and
>>> the commit message.
>>
>> OK.
>>
>>>
 +
 +
 +@ propagate_in_realize @
 +identifier match_realize.realizefn;
 +identifier err;
 +identifier errp;
 +identifier func_with_errp =~ "(?!object_property_set_link)";
>>>
>>> What are you trying to accomplish with this lookahead assertion?
>>
>> "match all func_with_errp() except object_property_set_link()"?
> 
> What's wrong with
> 
> identifier func_with_errp != object_property_set_link

Nothing wrong, I didn't know this form by the time I wrote this script.

> 
> ?
> 
> [...]
> 



[PATCH v7 39/48] pci: pass along the return value of dma_memory_rw

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

The nvme device needs to know the return value of dma_memory_rw to pass
block/011 from blktests. So pass it along instead of ignoring it.

There are no existing users of the return value, so this patch should be
safe.
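
For illustration only (the nvme-side handling is done in a separate patch
of this series, not quoted here), a caller can then surface the DMA error
instead of silently succeeding, along these lines:

    static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    {
        /* ... CMB fast path elided ... */

        if (pci_dma_read(&n->parent_obj, addr, buf, size)) {
            return -1;  /* propagate the DMA error to the caller */
        }

        return 0;
    }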

Signed-off-by: Klaus Jensen 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Michael S. Tsirkin 
Acked-by: Keith Busch 
---
 include/hw/pci/pci.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index cfedf5a995d7..da9057b8db97 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -784,8 +784,7 @@ static inline AddressSpace *pci_get_address_space(PCIDevice 
*dev)
 static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr,
  void *buf, dma_addr_t len, DMADirection dir)
 {
-dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);
-return 0;
+return dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);
 }
 
 static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr,
-- 
2.26.0




[PATCH v7 37/48] nvme: add nvme_check_rw helper

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c123be10fd0d..ffc49985321b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -651,6 +651,25 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, 
uint64_t slba,
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeNamespace *ns = req->ns;
+size_t len = req->nlb << nvme_ns_lbads(ns);
+uint16_t status;
+
+status = nvme_check_mdts(n, len, req);
+if (status) {
+return status;
+}
+
+status = nvme_check_bounds(n, req->slba, req->nlb, req);
+if (status) {
+return status;
+}
+
+return NVME_SUCCESS;
+}
+
 static void nvme_rw_cb(NvmeRequest *req, void *opaque)
 {
 NvmeSQueue *sq = req->sq;
@@ -810,12 +829,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 trace_nvme_dev_rw(nvme_req_is_write(req) ? "write" : "read", req->nlb,
   req->nlb << nvme_ns_lbads(req->ns), req->slba);
 
-status = nvme_check_mdts(n, len, req);
-if (status) {
-goto invalid;
-}
-
-status = nvme_check_bounds(n, req->slba, req->nlb, req);
+status = nvme_check_rw(n, req);
 if (status) {
 goto invalid;
 }
-- 
2.26.0




[PATCH v7 33/48] nvme: be consistent about zeros vs zeroes

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

The spec in general uses 'zeroes' and not 'zeros'.

Now, according to the Oxford dictionary, 'zeroes' is the action of
zeroing something, i.e. "he zeroes the range" and 'zeros' is the plural
of zero. Thus, Write Zeroes should actually be called Write Zeros, but
alas, let us align with the spec.

Signed-off-by: Klaus Jensen 
---
 block/nvme.c | 4 ++--
 hw/block/nvme.c  | 8 
 include/block/nvme.h | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 7302cc19ade4..304e975e0270 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -465,7 +465,7 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
   s->page_size / sizeof(uint64_t) * s->page_size);
 
 oncs = le16_to_cpu(idctrl->oncs);
-s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS);
+s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
 s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 
 memset(resp, 0, 4096);
@@ -1119,7 +1119,7 @@ static coroutine_fn int 
nvme_co_pwrite_zeroes(BlockDriverState *bs,
 }
 
 NvmeCmd cmd = {
-.opcode = NVME_CMD_WRITE_ZEROS,
+.opcode = NVME_CMD_WRITE_ZEROES,
 .nsid = cpu_to_le32(s->nsid),
 .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0x),
 .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0x),
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d8edd071b261..94d42046149e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -564,7 +564,7 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
 NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
@@ -662,8 +662,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 switch (cmd->opcode) {
 case NVME_CMD_FLUSH:
 return nvme_flush(n, ns, cmd, req);
-case NVME_CMD_WRITE_ZEROS:
-return nvme_write_zeros(n, ns, cmd, req);
+case NVME_CMD_WRITE_ZEROES:
+return nvme_write_zeroes(n, ns, cmd, req);
 case NVME_CMD_WRITE:
 case NVME_CMD_READ:
 return nvme_rw(n, ns, cmd, req);
@@ -2086,7 +2086,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 id->sqes = (0x6 << 4) | 0x6;
 id->cqes = (0x4 << 4) | 0x4;
 id->nn = cpu_to_le32(n->num_namespaces);
-id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
+id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP);
 
 pstrcpy((char *) id->subnqn, sizeof(id->subnqn), "nqn.2019-08.org.qemu:");
 pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 88e5385a9d3f..c4c669e32fc4 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -287,7 +287,7 @@ enum NvmeIoCommands {
 NVME_CMD_READ   = 0x02,
 NVME_CMD_WRITE_UNCOR= 0x04,
 NVME_CMD_COMPARE= 0x05,
-NVME_CMD_WRITE_ZEROS= 0x08,
+NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
 };
 
@@ -665,7 +665,7 @@ enum NvmeIdCtrlOncs {
 NVME_ONCS_COMPARE   = 1 << 0,
 NVME_ONCS_WRITE_UNCORR  = 1 << 1,
 NVME_ONCS_DSM   = 1 << 2,
-NVME_ONCS_WRITE_ZEROS   = 1 << 3,
+NVME_ONCS_WRITE_ZEROES  = 1 << 3,
 NVME_ONCS_FEATURES  = 1 << 4,
 NVME_ONCS_RESRVATIONS   = 1 << 5,
 NVME_ONCS_TIMESTAMP = 1 << 6,
-- 
2.26.0




[PATCH v7 27/48] nvme: refactor dma read/write

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Refactor the nvme_dma_{read,write}_prp functions into a common function
taking a DMADirection parameter.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 88 -
 1 file changed, 43 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3e41b1337bf7..2ff7dd695cd7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -307,55 +307,50 @@ unmap:
 return status;
 }
 
-static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-   uint64_t prp1, uint64_t prp2)
+static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+ uint64_t prp1, uint64_t prp2, DMADirection dir)
 {
 QEMUSGList qsg;
 QEMUIOVector iov;
 uint16_t status = NVME_SUCCESS;
 
-if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
-return NVME_INVALID_FIELD | NVME_DNR;
+status = nvme_map_prp(&qsg, &iov, prp1, prp2, len, n);
+if (status) {
+return status;
 }
+
 if (qsg.nsg > 0) {
-if (dma_buf_write(ptr, len, &qsg)) {
-status = NVME_INVALID_FIELD | NVME_DNR;
+uint64_t residual;
+
+if (dir == DMA_DIRECTION_TO_DEVICE) {
+residual = dma_buf_write(ptr, len, &qsg);
+} else {
+residual = dma_buf_read(ptr, len, &qsg);
 }
-qemu_sglist_destroy(&qsg);
-} else {
-if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
-status = NVME_INVALID_FIELD | NVME_DNR;
-}
-qemu_iovec_destroy(&iov);
-}
-return status;
-}
 
-static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-uint64_t prp1, uint64_t prp2)
-{
-QEMUSGList qsg;
-QEMUIOVector iov;
-uint16_t status = NVME_SUCCESS;
-
-trace_nvme_dev_dma_read(prp1, prp2);
-
-if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
-return NVME_INVALID_FIELD | NVME_DNR;
-}
-if (qsg.nsg > 0) {
-if (unlikely(dma_buf_read(ptr, len, &qsg))) {
+if (unlikely(residual)) {
 trace_nvme_dev_err_invalid_dma();
 status = NVME_INVALID_FIELD | NVME_DNR;
 }
+
 qemu_sglist_destroy(&qsg);
 } else {
-if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
+size_t bytes;
+
+if (dir == DMA_DIRECTION_TO_DEVICE) {
+bytes = qemu_iovec_to_buf(&iov, 0, ptr, len);
+} else {
+bytes = qemu_iovec_from_buf(&iov, 0, ptr, len);
+}
+
+if (unlikely(bytes != len)) {
 trace_nvme_dev_err_invalid_dma();
 status = NVME_INVALID_FIELD | NVME_DNR;
 }
+
 qemu_iovec_destroy(&iov);
 }
+
 return status;
 }
 
@@ -788,8 +783,8 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, 
uint8_t rae,
 nvme_clear_events(n, NVME_AER_TYPE_SMART);
 }
 
-return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
- prp2);
+return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
+DMA_DIRECTION_FROM_DEVICE);
 }
 
 static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
@@ -808,8 +803,8 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, 
uint32_t buf_len,
 
 trans_len = MIN(sizeof(fw_log) - off, buf_len);
 
-return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1,
- prp2);
+return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
+DMA_DIRECTION_FROM_DEVICE);
 }
 
 static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
@@ -833,7 +828,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, 
uint8_t rae,
 
 trans_len = MIN(sizeof(errlog) - off, buf_len);
 
-return nvme_dma_read_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2);
+return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2,
+DMA_DIRECTION_FROM_DEVICE);
 }
 
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
@@ -981,8 +977,8 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, 
NvmeIdentify *c)
 
 trace_nvme_dev_identify_ctrl();
 
-return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
-prp1, prp2);
+return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
+prp2, DMA_DIRECTION_FROM_DEVICE);
 }
 
 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
@@ -1001,8 +997,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify 
*c)
 
 ns = &n->namespaces[nsid - 1];
 
-return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
-prp1, prp2);
+return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
+prp2, DMA_DIRECTION_FROM_DEVICE);
 }

[PATCH v7 48/48] nvme: make lba data size configurable

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme-ns.c | 7 ++-
 hw/block/nvme-ns.h | 4 +++-
 hw/block/nvme.c| 1 +
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index bd64d4a94632..d34b6b2439f1 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -18,7 +18,7 @@ static int nvme_ns_init(NvmeNamespace *ns)
 {
 NvmeIdNs *id_ns = &ns->id_ns;
 
-id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
+id_ns->lbaf[0].ds = ns->params.lbads;
 id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
 /* no thin provisioning */
@@ -78,6 +78,11 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, 
Error **errp)
 return -1;
 }
 
+if (ns->params.lbads < 9 || ns->params.lbads > 12) {
+error_setg(errp, "unsupported lbads (supported: 9-12)");
+return 1;
+}
+
 return 0;
 }
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 3c3651d485d0..43b78f8b8d9c 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -7,10 +7,12 @@
 
 #define DEFINE_NVME_NS_PROPERTIES(_state, _props) \
 DEFINE_PROP_DRIVE("drive", _state, blk), \
-DEFINE_PROP_UINT32("nsid", _state, _props.nsid, 0)
+DEFINE_PROP_UINT32("nsid", _state, _props.nsid, 0), \
+DEFINE_PROP_UINT8("lbads", _state, _props.lbads, BDRV_SECTOR_BITS)
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+uint8_t  lbads;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 40a400333828..dd2759a4ce2e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2690,6 +2690,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 if (n->namespace.blk) {
 ns = &n->namespace;
 ns->params.nsid = 1;
+ns->params.lbads = BDRV_SECTOR_BITS;
 
 if (nvme_ns_setup(n, ns, errp)) {
 return;
-- 
2.26.0
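
For reference (not part of the patch), a namespace with 4096-byte LBAs can
then be requested roughly like this, reusing the nvme-ns device introduced
earlier in the series; the image and drive names are placeholders:

  -drive file=nvme0n1.img,if=none,id=disk1
  -device nvme,serial=deadbeef,id=nvme0
  -device nvme-ns,drive=disk1,bus=nvme0,nsid=1,lbads=12

Here lbads=12 selects an LBA data size of 2^12 = 4096 bytes; the default
remains BDRV_SECTOR_BITS (512-byte LBAs).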




[PATCH v7 45/48] nvme: support multiple namespaces

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

This adds support for multiple namespaces by introducing a new 'nvme-ns'
device model. The nvme device creates a bus named from the device name
('id'). The nvme-ns devices then connect to this and register
themselves with the nvme device.

This changes how an nvme device is created. Example with two namespaces:

  -drive file=nvme0n1.img,if=none,id=disk1
  -drive file=nvme0n2.img,if=none,id=disk2
  -device nvme,serial=deadbeef,id=nvme0
  -device nvme-ns,drive=disk1,bus=nvme0,nsid=1
  -device nvme-ns,drive=disk2,bus=nvme0,nsid=2

The drive property is kept on the nvme device to keep the change
backward compatible, but the property is now optional. Specifying a
drive for the nvme device will always create the namespace with nsid 1.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Reviewed-by: Keith Busch 
---
 hw/block/Makefile.objs |   2 +-
 hw/block/nvme-ns.c | 157 +++
 hw/block/nvme-ns.h |  60 +++
 hw/block/nvme.c| 233 +++--
 hw/block/nvme.h|  47 -
 hw/block/trace-events  |   8 +-
 6 files changed, 396 insertions(+), 111 deletions(-)
 create mode 100644 hw/block/nvme-ns.c
 create mode 100644 hw/block/nvme-ns.h

diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index 4b4a2b338dc4..d9141d6a4b9b 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -7,7 +7,7 @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
 common-obj-$(CONFIG_XEN) += xen-block.o
 common-obj-$(CONFIG_ECC) += ecc.o
 common-obj-$(CONFIG_ONENAND) += onenand.o
-common-obj-$(CONFIG_NVME_PCI) += nvme.o
+common-obj-$(CONFIG_NVME_PCI) += nvme.o nvme-ns.o
 common-obj-$(CONFIG_SWIM) += swim.o
 
 common-obj-$(CONFIG_SH4) += tc58128.o
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
new file mode 100644
index ..bd64d4a94632
--- /dev/null
+++ b/hw/block/nvme-ns.c
@@ -0,0 +1,157 @@
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "qemu/cutils.h"
+#include "qemu/log.h"
+#include "hw/block/block.h"
+#include "hw/pci/pci.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+
+#include "hw/qdev-properties.h"
+#include "hw/qdev-core.h"
+
+#include "nvme.h"
+#include "nvme-ns.h"
+
+static int nvme_ns_init(NvmeNamespace *ns)
+{
+NvmeIdNs *id_ns = &ns->id_ns;
+
+id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
+id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
+
+/* no thin provisioning */
+id_ns->ncap = id_ns->nsze;
+id_ns->nuse = id_ns->ncap;
+
+return 0;
+}
+
+static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, NvmeIdCtrl *id,
+Error **errp)
+{
+uint64_t perm, shared_perm;
+
+Error *local_err = NULL;
+int ret;
+
+perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+shared_perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+BLK_PERM_GRAPH_MOD;
+
+ret = blk_set_perm(ns->blk, perm, shared_perm, &local_err);
+if (ret) {
+error_propagate_prepend(errp, local_err,
+"could not set block permissions: ");
+return ret;
+}
+
+ns->size = blk_getlength(ns->blk);
+if (ns->size < 0) {
+error_setg_errno(errp, -ns->size, "could not get blockdev size");
+return -1;
+}
+
+switch (n->conf.wce) {
+case ON_OFF_AUTO_ON:
+n->features.volatile_wc = 1;
+break;
+case ON_OFF_AUTO_OFF:
+n->features.volatile_wc = 0;
+break;
+case ON_OFF_AUTO_AUTO:
+n->features.volatile_wc = blk_enable_write_cache(ns->blk);
+break;
+default:
+abort();
+}
+
+blk_set_enable_write_cache(ns->blk, n->features.volatile_wc);
+
+return 0;
+}
+
+static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
+{
+if (!ns->blk) {
+error_setg(errp, "block backend not configured");
+return -1;
+}
+
+return 0;
+}
+
+int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+{
+if (nvme_ns_check_constraints(ns, errp)) {
+return -1;
+}
+
+if (nvme_ns_init_blk(n, ns, &n->id_ctrl, errp)) {
+return -1;
+}
+
+nvme_ns_init(ns);
+if (nvme_register_namespace(n, ns, errp)) {
+return -1;
+}
+
+return 0;
+}
+
+static void nvme_ns_realize(DeviceState *dev, Error **errp)
+{
+NvmeNamespace *ns = NVME_NS(dev);
+BusState *s = qdev_get_parent_bus(dev);
+NvmeCtrl *n = NVME(s->parent);
+Error *local_err = NULL;
+
+if (nvme_ns_setup(n, ns, &local_err)) {
+error_propagate_prepend(errp, local_err,
+"could not setup namespace: ");
+return;
+}
+}
+
+static Property nvme_ns_props[] = {
+DEFINE_NVME_NS_PROPERTIES(NvmeNamespace, params),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void nvme_ns_class_init(ObjectClass *oc, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(oc);
+
+set_bit(DEVICE_CATEGORY_STORAGE,

[PATCH v7 44/48] nvme: refactor identify active namespace id list

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Prepare to support inactive namespaces.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f295f027b8e2..05a6fa334a70 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1549,16 +1549,16 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 uint32_t min_nsid = le32_to_cpu(c->nsid);
 uint32_t *list;
 uint16_t ret;
-int i, j = 0;
+int j = 0;
 
 trace_nvme_dev_identify_nslist(min_nsid);
 
 list = g_malloc0(data_len);
-for (i = 0; i < n->num_namespaces; i++) {
-if (i < min_nsid) {
+for (int i = 1; i <= n->num_namespaces; i++) {
+if (i <= min_nsid) {
 continue;
 }
-list[j++] = cpu_to_le32(i + 1);
+list[j++] = cpu_to_le32(i);
 if (j == data_len / sizeof(uint32_t)) {
 break;
 }
-- 
2.26.0




[PATCH v7 38/48] nvme: use preallocated qsg/iov in nvme_dma_prp

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Since cleanup of the request qsg/iov has been moved to the common
nvme_enqueue_req_completion function, there is no need to use a
stack-allocated qsg/iov in nvme_dma_prp.

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 18 ++
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ffc49985321b..eb15a0bd3cf9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -346,45 +346,39 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
uint32_t len,
  uint64_t prp1, uint64_t prp2, DMADirection dir,
  NvmeRequest *req)
 {
-QEMUSGList qsg;
-QEMUIOVector iov;
 uint16_t status = NVME_SUCCESS;
 
-status = nvme_map_prp(n, &qsg, &iov, prp1, prp2, len, req);
+status = nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
 if (status) {
 return status;
 }
 
-if (qsg.nsg > 0) {
+if (req->qsg.nsg > 0) {
 uint64_t residual;
 
 if (dir == DMA_DIRECTION_TO_DEVICE) {
-residual = dma_buf_write(ptr, len, &qsg);
+residual = dma_buf_write(ptr, len, &req->qsg);
 } else {
-residual = dma_buf_read(ptr, len, &qsg);
+residual = dma_buf_read(ptr, len, &req->qsg);
 }
 
 if (unlikely(residual)) {
 trace_nvme_dev_err_invalid_dma();
 status = NVME_INVALID_FIELD | NVME_DNR;
 }
-
-qemu_sglist_destroy(&qsg);
 } else {
 size_t bytes;
 
 if (dir == DMA_DIRECTION_TO_DEVICE) {
-bytes = qemu_iovec_to_buf(&iov, 0, ptr, len);
+bytes = qemu_iovec_to_buf(&req->iov, 0, ptr, len);
 } else {
-bytes = qemu_iovec_from_buf(&iov, 0, ptr, len);
+bytes = qemu_iovec_from_buf(&req->iov, 0, ptr, len);
 }
 
 if (unlikely(bytes != len)) {
 trace_nvme_dev_err_invalid_dma();
 status = NVME_INVALID_FIELD | NVME_DNR;
 }
-
-qemu_iovec_destroy(&iov);
 }
 
 return status;
-- 
2.26.0




[PATCH v7 42/48] nvme: add support for scatter gather lists

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

For now, support the Data Block, Segment and Last Segment descriptor
types.

See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
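
For reference, a minimal standalone sketch of the descriptor format being
parsed (struct and enum names are illustrative): each descriptor is 16
bytes, the type lives in the upper nibble of the last byte, and a (Last)
Segment descriptor must have a non-zero length that is a multiple of the
descriptor size.

    #include <stdint.h>
    #include <stdio.h>

    enum {
        SGL_DESCR_TYPE_DATA_BLOCK   = 0x0,
        SGL_DESCR_TYPE_SEGMENT      = 0x2,
        SGL_DESCR_TYPE_LAST_SEGMENT = 0x3,
    };

    typedef struct SglDescriptor {
        uint64_t addr;
        uint32_t len;
        uint8_t  rsvd[3];
        uint8_t  type;      /* descriptor type in bits 7:4 */
    } SglDescriptor;

    static int sgl_type(const SglDescriptor *d)
    {
        return d->type >> 4;
    }

    int main(void)
    {
        SglDescriptor seg = {
            .addr = 0x1000,
            .len  = 2 * sizeof(SglDescriptor),
            .type = SGL_DESCR_TYPE_LAST_SEGMENT << 4,
        };

        /* a (Last) Segment descriptor must have a non-zero length that is
         * a multiple of 16 */
        if (!seg.len || seg.len & 0xf) {
            printf("invalid segment descriptor\n");
            return 1;
        }

        printf("type=%d, %u descriptors in segment\n", sgl_type(&seg),
               (unsigned)(seg.len / sizeof(SglDescriptor)));
        return 0;
    }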

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 hw/block/nvme.c   | 332 ++
 hw/block/trace-events |   4 +
 2 files changed, 278 insertions(+), 58 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 5140bc32913d..a19085e605e7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -358,13 +358,263 @@ unmap:
 return status;
 }
 
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
- uint64_t prp1, uint64_t prp2, DMADirection dir,
+/*
+ * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
+ * number of bytes mapped from *len.
+ */
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+  QEMUIOVector *iov,
+  NvmeSglDescriptor *segment, uint64_t nsgld,
+  size_t *len, NvmeRequest *req)
+{
+dma_addr_t addr, trans_len;
+uint32_t dlen;
+uint16_t status;
+
+for (int i = 0; i < nsgld; i++) {
+uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+switch (type) {
+case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+break;
+case NVME_SGL_DESCR_TYPE_SEGMENT:
+case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+default:
+return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+}
+
+dlen = le32_to_cpu(segment[i].len);
+if (!dlen) {
+continue;
+}
+
+if (*len == 0) {
+/*
+ * All data has been mapped, but the SGL contains additional
+ * segments and/or descriptors. The controller might accept
+ * ignoring the rest of the SGL.
+ */
+uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
+if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+break;
+}
+
+trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
+return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+}
+
+trans_len = MIN(*len, dlen);
+addr = le64_to_cpu(segment[i].addr);
+
+if (UINT64_MAX - addr < dlen) {
+return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+}
+
+status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+if (status) {
+return status;
+}
+
+*len -= trans_len;
+}
+
+return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+ NvmeSglDescriptor sgl, size_t len,
  NvmeRequest *req)
+{
+/*
+ * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+ * dynamically allocating a potentially huge SGL. The spec allows the SGL
+ * to be larger (as in number of bytes required to describe the SGL
+ * descriptors and segment chain) than the command transfer size, so it is
+ * not bounded by MDTS.
+ */
+const int SEG_CHUNK_SIZE = 256;
+
+NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+uint64_t nsgld;
+uint32_t seg_len;
+uint16_t status;
+bool sgl_in_cmb = false;
+hwaddr addr;
+int ret;
+
+sgld = &sgl;
+addr = le64_to_cpu(sgl.addr);
+
+trace_nvme_dev_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), req->nlb,
+   len);
+
+/*
+ * If the entire transfer can be described with a single data block it can
+ * be mapped directly.
+ */
+if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+if (status) {
+goto unmap;
+}
+
+goto out;
+}
+
+/*
+ * If the segment is located in the CMB, the submission queue of the
+ * request must also reside there.
+ */
+if (nvme_addr_is_cmb(n, addr)) {
+if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+}
+
+sgl_in_cmb = true;
+}
+
+for (;;) {
+switch (NVME_SGL_TYPE(sgld->type)) {
+case NVME_SGL_DESCR_TYPE_SEGMENT:
+case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+break;
+default:
+return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+}
+
+seg_len = le32_to_cpu(sgld->len);
+
+/* check the length of the (Last) Segment descriptor */
+if (!seg_len || seg_len & 0xf) {
+return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+}
+
+if (UINT64_MAX - addr < seg_len) {
+return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+}
+
+nsgld = seg_len / sizeof(NvmeSglDescriptor);
+
+whil

[PATCH v7 40/48] nvme: handle dma errors

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Handling DMA errors gracefully is required for the device to pass the
block/011 test ("disable PCI device while doing I/O") in the blktests
suite.

With this patch the device passes the test by retrying "critical"
transfers (posting of completion entries and processing of submission
queue entries).

If DMA errors occur at any other point in the execution of the command
(say, while mapping the PRPs), the command is aborted with a Data
Transfer Error status code.
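
A rough standalone sketch of the two strategies described above (the names
and structure are illustrative, not the device code): a command-path
failure simply returns a status code, while a failed completion posting
leaves the entry queued and re-arms a timer.

    #include <stdbool.h>
    #include <stdio.h>

    #define STATUS_SUCCESS        0x0000
    #define STATUS_DATA_TRANSFER  0x0004  /* Data Transfer Error */

    /* command path: a DMA failure aborts the command with a status code */
    static int map_prps(bool dma_ok)
    {
        return dma_ok ? STATUS_SUCCESS : STATUS_DATA_TRANSFER;
    }

    /* critical transfer (posting a completion entry): keep the entry on
     * the queue and ask for a retry later */
    static bool post_cqe(bool dma_ok, int *retry_in_ms)
    {
        if (!dma_ok) {
            *retry_in_ms = 500; /* the patch re-arms the cq timer by 500 ms */
            return false;       /* request stays on the cq req_list */
        }
        *retry_in_ms = 0;
        return true;
    }

    int main(void)
    {
        int delay;

        printf("map_prps status: 0x%x\n", map_prps(false));
        printf("cqe posted: %d, retry in %d ms\n",
               (int)post_cqe(false, &delay), delay);
        return 0;
    }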

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 45 ---
 hw/block/trace-events |  2 ++
 include/block/nvme.h  |  2 +-
 3 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index eb15a0bd3cf9..6dcd9c4b4cd0 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -74,14 +74,14 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr 
addr)
 return addr >= low && addr < hi;
 }
 
-static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
+static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
-return;
+return 0;
 }
 
-pci_dma_read(&n->parent_obj, addr, buf, size);
+return pci_dma_read(&n->parent_obj, addr, buf, size);
 }
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
@@ -185,7 +185,7 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector 
*iov, hwaddr addr,
 }
 
 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
-return NVME_DATA_TRAS_ERROR;
+return NVME_DATA_TRANSFER_ERROR;
 }
 
 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
@@ -238,6 +238,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 int num_prps = (len >> n->page_bits) + 1;
 uint16_t status;
 bool prp_list_in_cmb = false;
+int ret;
 
 trace_nvme_dev_map_prp(nvme_cid(req), trans_len, len, prp1, prp2,
num_prps);
@@ -277,7 +278,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
+ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
+if (ret) {
+trace_nvme_dev_err_addr_read(prp2);
+status = NVME_DATA_TRANSFER_ERROR;
+goto unmap;
+}
 while (len != 0) {
 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 
@@ -296,8 +302,13 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 i = 0;
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-nvme_addr_read(n, prp_ent, (void *)prp_list,
-prp_trans);
+ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
+ prp_trans);
+if (ret) {
+trace_nvme_dev_err_addr_read(prp_ent);
+status = NVME_DATA_TRANSFER_ERROR;
+goto unmap;
+}
 prp_ent = le64_to_cpu(prp_list[i]);
 }
 
@@ -502,6 +513,7 @@ static void nvme_post_cqes(void *opaque)
 NvmeCQueue *cq = opaque;
 NvmeCtrl *n = cq->ctrl;
 NvmeRequest *req, *next;
+int ret;
 
 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
 NvmeSQueue *sq;
@@ -511,15 +523,21 @@ static void nvme_post_cqes(void *opaque)
 break;
 }
 
-QTAILQ_REMOVE(&cq->req_list, req, entry);
 sq = req->sq;
 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
 req->cqe.sq_id = cpu_to_le16(sq->sqid);
 req->cqe.sq_head = cpu_to_le16(sq->head);
 addr = cq->dma_addr + cq->tail * n->cqe_size;
+ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
+sizeof(req->cqe));
+if (ret) {
+trace_nvme_dev_err_addr_write(addr);
+timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+  500 * SCALE_MS);
+break;
+}
+QTAILQ_REMOVE(&cq->req_list, req, entry);
 nvme_inc_cq_tail(cq);
-pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
-sizeof(req->cqe));
 nvme_req_clear(req);
 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
 }
@@ -1664,7 +1682,12 @@ static void nvme_process_sq(void *opaque)
 
 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {

[PATCH v7 29/48] nvme: add request mapping helper

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Introduce the nvme_map helper to remove some noise in the main nvme_rw
function.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 66f92f6f6f2d..1f4ce48b9cbb 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -359,6 +359,15 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
uint32_t len,
 return status;
 }
 
+static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
+ QEMUIOVector *iov, size_t len, NvmeRequest *req)
+{
+uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
+uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
+
+return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
+}
+
 static void nvme_post_cqes(void *opaque)
 {
 NvmeCQueue *cq = opaque;
@@ -546,8 +555,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
 uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
-uint64_t prp1 = le64_to_cpu(rw->dptr.prp1);
-uint64_t prp2 = le64_to_cpu(rw->dptr.prp2);
 
 uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
 uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
@@ -564,7 +571,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 return NVME_LBA_RANGE | NVME_DNR;
 }
 
-if (nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, data_size, req)) {
+if (nvme_map(n, cmd, &req->qsg, &req->iov, data_size, req)) {
 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
 return NVME_INVALID_FIELD | NVME_DNR;
 }
-- 
2.26.0




[PATCH v7 24/48] nvme: add mapping helpers

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Add nvme_map_addr, nvme_map_addr_cmb and nvme_addr_to_cmb helpers and
use them in nvme_map_prp.

This fixes a bug where, in the case of a CMB transfer, the device would
map the buffer with a wrong length.
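
For reference, a minimal standalone sketch of the kind of check involved
(cmb_base, cmb_size and the helper names are made up): a transfer of len
bytes starting at addr lies in the CMB only if both its first and its last
byte do.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    static const uint64_t cmb_base = 0x10000000;
    static const uint64_t cmb_size = 0x800000;  /* 8 MiB */

    static bool addr_is_cmb(uint64_t addr)
    {
        return addr >= cmb_base && addr < cmb_base + cmb_size;
    }

    /* valid only if the whole [addr, addr + len) range is inside the CMB */
    static bool range_is_cmb(uint64_t addr, size_t len)
    {
        return addr_is_cmb(addr) && addr_is_cmb(addr + len - 1);
    }

    int main(void)
    {
        printf("%d\n", (int)range_is_cmb(cmb_base + cmb_size - 4096, 4096)); /* 1 */
        printf("%d\n", (int)range_is_cmb(cmb_base + cmb_size - 4096, 8192)); /* 0 */
        return 0;
    }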

Fixes: b2b2b67a00574 ("nvme: Add support for Read Data and Write Data in CMBs.")
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 105 +++---
 hw/block/trace-events |   1 +
 2 files changed, 89 insertions(+), 17 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1d4705693287..b62b053d7c38 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -59,6 +59,11 @@
 
 static void nvme_process_sq(void *opaque);
 
+static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
+{
+return &n->cmbuf[addr - n->ctrl_mem.addr];
+}
+
 static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
 hwaddr low = n->ctrl_mem.addr;
@@ -70,7 +75,7 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+memcpy(buf, nvme_addr_to_cmb(n, addr), size);
 return;
 }
 
@@ -153,29 +158,87 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
 }
 }
 
+static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
+  size_t len)
+{
+if (!len) {
+return NVME_SUCCESS;
+}
+
+if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
+return NVME_DATA_TRAS_ERROR;
+}
+
+qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
+
+return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+  hwaddr addr, size_t len)
+{
+if (!len) {
+return NVME_SUCCESS;
+}
+
+if (nvme_addr_is_cmb(n, addr)) {
+if (qsg && qsg->sg) {
+return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+}
+
+assert(iov);
+
+if (!iov->iov) {
+qemu_iovec_init(iov, 1);
+}
+
+return nvme_map_addr_cmb(n, iov, addr, len);
+}
+
+if (iov && iov->iov) {
+return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+}
+
+assert(qsg);
+
+if (!qsg->sg) {
+pci_dma_sglist_init(qsg, &n->parent_obj, 1);
+}
+
+qemu_sglist_add(qsg, addr, len);
+
+return NVME_SUCCESS;
+}
+
 static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
+uint16_t status;
 
 if (unlikely(!prp1)) {
 trace_nvme_dev_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
-qsg->nsg = 0;
+}
+
+if (nvme_addr_is_cmb(n, prp1)) {
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], 
trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
-qemu_sglist_add(qsg, prp1, trans_len);
 }
+
+status = nvme_map_addr(n, qsg, iov, prp1, trans_len);
+if (status) {
+goto unmap;
+}
+
 len -= trans_len;
 if (len) {
 if (unlikely(!prp2)) {
 trace_nvme_dev_err_invalid_prp2_missing();
+status = NVME_INVALID_FIELD | NVME_DNR;
 goto unmap;
 }
 if (len > n->page_size) {
@@ -192,6 +255,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (i == n->max_prp_ents - 1 && len > n->page_size) {
 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
 trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
+status = NVME_INVALID_FIELD | NVME_DNR;
 goto unmap;
 }
 
@@ -205,14 +269,14 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
QEMUIOVector *iov, uint64_t prp1,
 
 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
 trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
+status = NVME_INVALID_FIELD | NVME_DNR;
 goto unmap;
 }
 
 trans_len = MIN(len, n->page_size);
-if (qsg->nsg){
-qemu_sglist_add(qsg, prp_ent, trans_len);
-} else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - 
n->ctrl_mem.addr], trans_len);
+status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len);
+if (st

[PATCH v7 31/48] nvme: refactor request bounds checking

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 28 ++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3e5e99644a4e..7528d75905d4 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -499,6 +499,20 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t 
event_type)
 }
 }
 
+static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
+ uint64_t slba, uint32_t nlb,
+ NvmeRequest *req)
+{
+uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
+
+if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
+trace_nvme_dev_err_invalid_lba_range(slba, nlb, nsze);
+return NVME_LBA_RANGE | NVME_DNR;
+}
+
+return NVME_SUCCESS;
+}
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
 NvmeRequest *req = opaque;
@@ -546,12 +560,13 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, 
NvmeNamespace *ns, NvmeCmd *cmd,
 uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
 uint64_t offset = slba << data_shift;
 uint32_t count = nlb << data_shift;
+uint16_t status;
 
 trace_nvme_dev_write_zeroes(nvme_cid(req), slba, nlb);
 
-if (unlikely(slba + nlb > ns->id_ns.nsze)) {
-trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return NVME_LBA_RANGE | NVME_DNR;
+status = nvme_check_bounds(n, ns, slba, nlb, req);
+if (status) {
+return status;
 }
 
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
@@ -574,13 +589,14 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 uint64_t data_offset = slba << data_shift;
 int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
 enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+uint16_t status;
 
 trace_nvme_dev_rw(is_write ? "write" : "read", nlb, data_size, slba);
 
-if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
+status = nvme_check_bounds(n, ns, slba, nlb, req);
+if (status) {
 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
-trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-return NVME_LBA_RANGE | NVME_DNR;
+return status;
 }
 
 if (nvme_map(n, cmd, &req->qsg, &req->iov, data_size, req)) {
-- 
2.26.0




[PATCH v7 34/48] nvme: refactor NvmeRequest

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Add a reference to the NvmeNamespace and move clearing of the structure
from "clear before use" to "clear after use".

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 38 +-
 hw/block/nvme.h |  1 +
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 94d42046149e..a7c5f93fc545 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -159,6 +159,12 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
 }
 }
 
+static void nvme_req_clear(NvmeRequest *req)
+{
+req->ns = NULL;
+memset(&req->cqe, 0x0, sizeof(req->cqe));
+}
+
 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
   size_t len)
 {
@@ -404,6 +410,7 @@ static void nvme_post_cqes(void *opaque)
 nvme_inc_cq_tail(cq);
 pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
 sizeof(req->cqe));
+nvme_req_clear(req);
 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
 }
 if (cq->tail != cq->head) {
@@ -513,10 +520,10 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, 
size_t len,
 return NVME_SUCCESS;
 }
 
-static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
- uint64_t slba, uint32_t nlb,
- NvmeRequest *req)
+static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
+ uint32_t nlb, NvmeRequest *req)
 {
+NvmeNamespace *ns = req->ns;
 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
 
 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
@@ -554,8 +561,7 @@ static void nvme_rw_cb(void *opaque, int ret)
 nvme_enqueue_req_completion(cq, req);
 }
 
-static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
-NvmeRequest *req)
+static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
  BLOCK_ACCT_FLUSH);
@@ -564,10 +570,10 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace 
*ns, NvmeCmd *cmd,
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
-NvmeRequest *req)
+static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+NvmeNamespace *ns = req->ns;
 const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
 const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
 uint64_t slba = le64_to_cpu(rw->slba);
@@ -578,7 +584,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeNamespace *ns, NvmeCmd *cmd,
 
 trace_nvme_dev_write_zeroes(nvme_cid(req), slba, nlb);
 
-status = nvme_check_bounds(n, ns, slba, nlb, req);
+status = nvme_check_bounds(n, slba, nlb, req);
 if (status) {
 return status;
 }
@@ -590,10 +596,10 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeNamespace *ns, NvmeCmd *cmd,
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
-NvmeRequest *req)
+static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+NvmeNamespace *ns = req->ns;
 uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
 
@@ -613,7 +619,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 return status;
 }
 
-status = nvme_check_bounds(n, ns, slba, nlb, req);
+status = nvme_check_bounds(n, slba, nlb, req);
 if (status) {
 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
 return status;
@@ -647,7 +653,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
-NvmeNamespace *ns;
 uint32_t nsid = le32_to_cpu(cmd->nsid);
 
 trace_nvme_dev_io_cmd(nvme_cid(req), nsid, le16_to_cpu(req->sq->sqid),
@@ -658,15 +663,15 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 return NVME_INVALID_NSID | NVME_DNR;
 }
 
-ns = &n->namespaces[nsid - 1];
+req->ns = &n->namespaces[nsid - 1];
 switch (cmd->opcode) {
 case NVME_CMD_FLUSH:
-return nvme_flush(n, ns, cmd, req);
+return nvme_flush(n, cmd, req);
 case NVME_CMD_WRITE_ZEROES:
-return nvme_write_zeroes(n, ns, cmd, req);
+return nvme_write_zeroes(n, cmd, req);
 case NVME_CMD_WRITE:
 case NVME_CMD_READ:
-return nvme_rw(n, ns, cmd, req);
+return nvme_rw(n, cmd, req);
 default:
 trace_nvme_dev_err_invalid_opc(cmd->opcode);
 return NVME_INVALID_OPCODE | NVME_DNR;
@@ -1463,7 +1468,6 @@ static void nvme_process_sq(void *opaque)
 req = QTAILQ_FIRST(&sq->req_list);
 QTAILQ_REMOV

[PATCH v7 43/48] nvme: add support for sgl bit bucket descriptor

2020-04-14 Thread Klaus Jensen
From: Gollu Appalanaidu 

This adds support for SGL descriptor type 0x1 (bit bucket descriptor).
See the NVM Express v1.3d specification, Section 4.4 ("Scatter Gather
List (SGL)").

Signed-off-by: Gollu Appalanaidu 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 33 +++--
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a19085e605e7..f295f027b8e2 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -375,6 +375,10 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 uint8_t type = NVME_SGL_TYPE(segment[i].type);
 
 switch (type) {
+case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
+if (nvme_req_is_write(req)) {
+continue;
+}
 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
 break;
 case NVME_SGL_DESCR_TYPE_SEGMENT:
@@ -385,6 +389,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 }
 
 dlen = le32_to_cpu(segment[i].len);
+
 if (!dlen) {
 continue;
 }
@@ -405,6 +410,11 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 }
 
 trans_len = MIN(*len, dlen);
+
+if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
+goto next;
+}
+
 addr = le64_to_cpu(segment[i].addr);
 
 if (UINT64_MAX - addr < dlen) {
@@ -416,6 +426,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 return status;
 }
 
+next:
 *len -= trans_len;
 }
 
@@ -486,7 +497,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 seg_len = le32_to_cpu(sgld->len);
 
 /* check the length of the (Last) Segment descriptor */
-if (!seg_len || seg_len & 0xf) {
+if ((!seg_len || seg_len & 0xf) &&
+(NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
 }
 
@@ -523,19 +535,27 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList 
*qsg, QEMUIOVector *iov,
 
 last_sgld = &segment[nsgld - 1];
 
-/* if the segment ends with a Data Block, then we are done */
-if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+/*
+ * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
+ * then we are done.
+ */
+switch (NVME_SGL_TYPE(last_sgld->type)) {
+case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
 status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
 if (status) {
 goto unmap;
 }
 
 goto out;
+
+default:
+break;
 }
 
 /*
- * If the last descriptor was not a Data Block, then the current
- * segment must not be a Last Segment.
+ * If the last descriptor was not a Data Block or Bit Bucket, then the
+ * current segment must not be a Last Segment.
  */
 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
@@ -2537,7 +2557,8 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 id->nn = cpu_to_le32(n->num_namespaces);
 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP);
 
-id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORTED_NO_ALIGNMENT);
+id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORTED_NO_ALIGNMENT |
+   NVME_CTRL_SGLS_BITBUCKET);
 
 pstrcpy((char *) id->subnqn, sizeof(id->subnqn), "nqn.2019-08.org.qemu:");
 pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
-- 
2.26.0




[PATCH v7 26/48] nvme: remove redundant has_sg member

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Remove the has_sg member from NvmeRequest since it's redundant.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 11 ++-
 hw/block/nvme.h |  1 -
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c9f7badd5a15..3e41b1337bf7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -494,16 +494,20 @@ static void nvme_rw_cb(void *opaque, int ret)
 block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
 req->status = NVME_INTERNAL_DEV_ERROR;
 }
-if (req->has_sg) {
+
+if (req->qsg.nalloc) {
 qemu_sglist_destroy(&req->qsg);
 }
+if (req->iov.nalloc) {
+qemu_iovec_destroy(&req->iov);
+}
+
 nvme_enqueue_req_completion(cq, req);
 }
 
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
 NvmeRequest *req)
 {
-req->has_sg = false;
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
  BLOCK_ACCT_FLUSH);
 req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
@@ -529,7 +533,6 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace 
*ns, NvmeCmd *cmd,
 return NVME_LBA_RANGE | NVME_DNR;
 }
 
-req->has_sg = false;
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
  BLOCK_ACCT_WRITE);
 req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
@@ -567,7 +570,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 }
 
 if (req->qsg.nsg > 0) {
-req->has_sg = true;
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size,
  acct);
 req->aiocb = is_write ?
@@ -576,7 +578,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
  nvme_rw_cb, req);
 } else {
-req->has_sg = false;
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size,
  acct);
 req->aiocb = is_write ?
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index f72ffddae160..a946ae88d817 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -29,7 +29,6 @@ typedef struct NvmeRequest {
 struct NvmeSQueue   *sq;
 BlockAIOCB  *aiocb;
 uint16_tstatus;
-boolhas_sg;
 NvmeCqe cqe;
 BlockAcctCookie acct;
 QEMUSGList  qsg;
-- 
2.26.0




[PATCH v7 35/48] nvme: remove NvmeCmd parameter

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Keep a copy of the raw nvme command in the NvmeRequest and remove the
now redundant NvmeCmd parameter.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 177 +---
 hw/block/nvme.h |   1 +
 2 files changed, 93 insertions(+), 85 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a7c5f93fc545..846aa31eaae9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -378,9 +378,10 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
uint32_t len,
 return status;
 }
 
-static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
- QEMUIOVector *iov, size_t len, NvmeRequest *req)
+static uint16_t nvme_map(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+ size_t len, NvmeRequest *req)
 {
+NvmeCmd *cmd = &req->cmd;
 uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
 uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
 
@@ -561,7 +562,7 @@ static void nvme_rw_cb(void *opaque, int ret)
 nvme_enqueue_req_completion(cq, req);
 }
 
-static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
  BLOCK_ACCT_FLUSH);
@@ -570,9 +571,9 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
-NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
 const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
 const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
@@ -596,9 +597,9 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 {
-NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
 uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
@@ -625,7 +626,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 return status;
 }
 
-if (nvme_map(n, cmd, &req->qsg, &req->iov, data_size, req)) {
+if (nvme_map(n, &req->qsg, &req->iov, data_size, req)) {
 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
 return NVME_INVALID_FIELD | NVME_DNR;
 }
@@ -651,12 +652,12 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 return NVME_NO_COMPLETE;
 }
 
-static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
-uint32_t nsid = le32_to_cpu(cmd->nsid);
+uint32_t nsid = le32_to_cpu(req->cmd.nsid);
 
 trace_nvme_dev_io_cmd(nvme_cid(req), nsid, le16_to_cpu(req->sq->sqid),
-  cmd->opcode);
+  req->cmd.opcode);
 
 if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
 trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
@@ -664,16 +665,16 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 }
 
 req->ns = &n->namespaces[nsid - 1];
-switch (cmd->opcode) {
+switch (req->cmd.opcode) {
 case NVME_CMD_FLUSH:
-return nvme_flush(n, cmd, req);
+return nvme_flush(n, req);
 case NVME_CMD_WRITE_ZEROES:
-return nvme_write_zeroes(n, cmd, req);
+return nvme_write_zeroes(n, req);
 case NVME_CMD_WRITE:
 case NVME_CMD_READ:
-return nvme_rw(n, cmd, req);
+return nvme_rw(n, req);
 default:
-trace_nvme_dev_err_invalid_opc(cmd->opcode);
+trace_nvme_dev_err_invalid_opc(req->cmd.opcode);
 return NVME_INVALID_OPCODE | NVME_DNR;
 }
 }
@@ -689,10 +690,10 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
 }
 }
 
-static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
+static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
 {
-NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
-NvmeRequest *req, *next;
+NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
+NvmeRequest *r, *next;
 NvmeSQueue *sq;
 NvmeCQueue *cq;
 uint16_t qid = le16_to_cpu(c->qid);
@@ -706,19 +707,19 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
 
 sq = n->sq[qid];
 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
-req = QTAILQ_FIRST(&sq->out_req_list);
-assert(req->aiocb);
-blk_aio_cancel(req->aiocb);
+r = QTAILQ_FIRST(&sq->out_req_list);
+assert(r->aiocb);
+blk_aio_cancel(r->aiocb);
 }
 if (!nvme_check_cqid(n, sq->cqid)) {
 cq = n->cq[sq->cqid];
 QTAILQ_REMOVE(&cq->sq_list, sq,

[PATCH v7 25/48] nvme: replace dma_acct with blk_acct equivalent

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

The QSG isn't always initialized, so accounting could be wrong. Issue a
call to blk_acct_start instead with the size taken from the QSG or IOV
depending on the kind of I/O.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b62b053d7c38..c9f7badd5a15 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -566,9 +566,10 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
 if (req->qsg.nsg > 0) {
 req->has_sg = true;
+block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size,
+ acct);
 req->aiocb = is_write ?
 dma_blk_write(n->conf.blk, &req->qsg, data_offset, 
BDRV_SECTOR_SIZE,
   nvme_rw_cb, req) :
@@ -576,6 +577,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
  nvme_rw_cb, req);
 } else {
 req->has_sg = false;
+block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size,
+ acct);
 req->aiocb = is_write ?
 blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
 req) :
-- 
2.26.0




[PATCH v7 30/48] nvme: verify validity of prp lists in the cmb

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Before this patch the device already supported PRP lists in the CMB, but
it neither checked their validity nor announced the support in the LISTS
field.

If some of the PRPs in a PRP list are in the CMB, then ALL entries must
be there. This patch verifies that and also properly announces support
for PRP lists in the CMB.
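
A minimal standalone sketch of the consistency rule (the helper name and
addresses are made up): remember whether the PRP list itself is in the CMB
and reject any list entry that lives on the other side.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static const uint64_t cmb_base = 0x10000000, cmb_size = 0x800000;

    static bool addr_is_cmb(uint64_t addr)
    {
        return addr >= cmb_base && addr < cmb_base + cmb_size;
    }

    /* all entries must be on the same side as the list pointed to by prp2 */
    static bool prp_list_is_consistent(uint64_t prp2, const uint64_t *ents,
                                       int nents)
    {
        bool in_cmb = addr_is_cmb(prp2);

        for (int i = 0; i < nents; i++) {
            if (addr_is_cmb(ents[i]) != in_cmb) {
                return false;  /* the device returns Invalid Use of CMB */
            }
        }
        return true;
    }

    int main(void)
    {
        uint64_t ok[]  = { cmb_base + 0x1000, cmb_base + 0x2000 };
        uint64_t bad[] = { cmb_base + 0x1000, 0x20000000 };

        printf("%d %d\n", (int)prp_list_is_consistent(cmb_base, ok, 2),
               (int)prp_list_is_consistent(cmb_base, bad, 2)); /* 1 0 */
        return 0;
    }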

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1f4ce48b9cbb..3e5e99644a4e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -218,6 +218,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 uint16_t status;
+bool prp_list_in_cmb = false;
 
 trace_nvme_dev_map_prp(nvme_cid(req), trans_len, len, prp1, prp2,
num_prps);
@@ -245,11 +246,16 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
*qsg, QEMUIOVector *iov,
 status = NVME_INVALID_FIELD | NVME_DNR;
 goto unmap;
 }
+
 if (len > n->page_size) {
 uint64_t prp_list[n->max_prp_ents];
 uint32_t nents, prp_trans;
 int i = 0;
 
+if (nvme_addr_is_cmb(n, prp2)) {
+prp_list_in_cmb = true;
+}
+
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
 nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
@@ -263,6 +269,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 goto unmap;
 }
 
+if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) {
+status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
+goto unmap;
+}
+
 i = 0;
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
@@ -282,6 +293,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 if (status) {
 goto unmap;
 }
+
 len -= trans_len;
 i++;
 }
@@ -1953,7 +1965,7 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
 
 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
-NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
+NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);
-- 
2.26.0




[PATCH v7 23/48] nvme: memset preallocated requests structures

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

This is preparatory to subsequent patches that change how QSGs/IOVs are
handled. It is important that the qsg and iov members of the NvmeRequest
are initially zeroed.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 59935d4641a6..1d4705693287 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -604,7 +604,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
uint64_t dma_addr,
 sq->size = size;
 sq->cqid = cqid;
 sq->head = sq->tail = 0;
-sq->io_req = g_new(NvmeRequest, sq->size);
+sq->io_req = g_new0(NvmeRequest, sq->size);
 
 QTAILQ_INIT(&sq->req_list);
 QTAILQ_INIT(&sq->out_req_list);
-- 
2.26.0




[PATCH v7 16/48] nvme: additional tracing

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Also, streamline nvme_identify_ns and nvme_identify_ns_list. They do not
need to repeat the command; it is already in the trace name.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 20 
 hw/block/trace-events | 13 +++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e448a1c85cb9..7094767eeccb 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -320,6 +320,8 @@ static void nvme_post_cqes(void *opaque)
 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 {
 assert(cq->cqid == req->sq->cqid);
+trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid,
+  req->status);
 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
@@ -411,6 +413,8 @@ static void nvme_rw_cb(void *opaque, int ret)
 NvmeCtrl *n = sq->ctrl;
 NvmeCQueue *cq = n->cq[sq->cqid];
 
+trace_nvme_dev_rw_cb(nvme_cid(req));
+
 if (!ret) {
 block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
 req->status = NVME_SUCCESS;
@@ -446,6 +450,8 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace 
*ns, NvmeCmd *cmd,
 uint64_t offset = slba << data_shift;
 uint32_t count = nlb << data_shift;
 
+trace_nvme_dev_write_zeroes(nvme_cid(req), slba, nlb);
+
 if (unlikely(slba + nlb > ns->id_ns.nsze)) {
 trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
 return NVME_LBA_RANGE | NVME_DNR;
@@ -513,6 +519,9 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 NvmeNamespace *ns;
 uint32_t nsid = le32_to_cpu(cmd->nsid);
 
+trace_nvme_dev_io_cmd(nvme_cid(req), nsid, le16_to_cpu(req->sq->sqid),
+  cmd->opcode);
+
 if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
 trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
 return NVME_INVALID_NSID | NVME_DNR;
@@ -1200,6 +1209,9 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
+trace_nvme_dev_admin_cmd(nvme_cid(req), le16_to_cpu(req->sq->sqid),
+ cmd->opcode);
+
 switch (cmd->opcode) {
 case NVME_ADM_CMD_DELETE_SQ:
 return nvme_del_sq(n, cmd);
@@ -1525,6 +1537,8 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, 
unsigned size)
 uint8_t *ptr = (uint8_t *)&n->bar;
 uint64_t val = 0;
 
+trace_nvme_dev_mmio_read(addr);
+
 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
 NVME_GUEST_ERR(nvme_dev_ub_mmiord_misaligned32,
"MMIO read not 32-bit aligned,"
@@ -1599,6 +1613,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 return;
 }
 
+trace_nvme_dev_mmio_doorbell_cq(cq->cqid, new_head);
+
 start_sqs = nvme_cq_full(cq) ? 1 : 0;
 cq->head = new_head;
 if (start_sqs) {
@@ -1651,6 +1667,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 return;
 }
 
+trace_nvme_dev_mmio_doorbell_sq(sq->sqid, new_tail);
+
 sq->tail = new_tail;
 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
 }
@@ -1659,6 +1677,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
 unsigned size)
 {
+trace_nvme_dev_mmio_write(addr, data);
+
 NvmeCtrl *n = (NvmeCtrl *)opaque;
 if (addr < sizeof(n->bar)) {
 nvme_write_bar(n, addr, data, size);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index b6fde13419bf..659091fc2fed 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -33,14 +33,18 @@ nvme_dev_irq_msix(uint32_t vector) "raising MSI-X IRQ 
vector %u"
 nvme_dev_irq_pin(void) "pulsing IRQ pin"
 nvme_dev_irq_masked(void) "IRQ is masked"
 nvme_dev_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" 
prp2=0x%"PRIx64""
+nvme_dev_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) 
"cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
+nvme_dev_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" 
sqid %"PRIu16" opc 0x%"PRIx8""
 nvme_dev_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, 
uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_dev_rw_cb(uint16_t cid) "cid %"PRIu16""
+nvme_dev_write_zeroes(uint16_t cid, uint64_t slba, uint32_t nlb) "cid 
%"PRIu16" slba %"PRIu64" nlb %"PRIu32""
 nvme_dev_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t 
qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", 
sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
 nvme_d

[PATCH v7 22/48] nvme: bump supported version to v1.3

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d5c293476411..59935d4641a6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -44,6 +44,7 @@
 #include "trace.h"
 #include "nvme.h"
 
+#define NVME_SPEC_VER 0x00010300
 #define NVME_CMB_BIR 2
 #define NVME_TEMPERATURE 0x143
 #define NVME_TEMPERATURE_WARNING 0x157
@@ -1913,6 +1914,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 id->ieee[0] = 0x00;
 id->ieee[1] = 0x02;
 id->ieee[2] = 0xb3;
+id->ver = cpu_to_le32(NVME_SPEC_VER);
 id->oacs = cpu_to_le16(0);
 
 /*
@@ -1957,7 +1959,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
 
-n->bar.vs = 0x00010200;
+n->bar.vs = NVME_SPEC_VER;
 n->bar.intmc = n->bar.intms = 0;
 }
 
-- 
2.26.0




[PATCH v7 32/48] nvme: add check for mdts

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Add 'mdts' device parameter to control the Maximum Data Transfer Size of
the controller and check that it is respected.
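
The limit the new check enforces is page_size << mdts bytes when mdts is
non-zero (zero means no limit); a quick standalone sketch of that
arithmetic, assuming 4 KiB pages and the default mdts value of 7 from this
patch:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* returns 0 for "no limit" */
    static size_t max_transfer_bytes(size_t page_size, uint8_t mdts)
    {
        return mdts ? page_size << mdts : 0;
    }

    int main(void)
    {
        /* mdts=7 with 4 KiB pages: 512 KiB per command */
        printf("%u KiB\n", (unsigned)(max_transfer_bytes(4096, 7) / 1024));
        return 0;
    }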

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 29 -
 hw/block/nvme.h   |  4 +++-
 hw/block/trace-events |  1 +
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7528d75905d4..d8edd071b261 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,7 +19,8 @@
  *  -drive file=,if=none,id=
  *  -device nvme,drive=,serial=,id=, \
  *  cmb_size_mb=, \
- *  max_ioqpairs=
+ *  max_ioqpairs=, \
+ *  mdts=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -499,6 +500,19 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t 
event_type)
 }
 }
 
+static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
+   NvmeRequest *req)
+{
+uint8_t mdts = n->params.mdts;
+
+if (mdts && len > n->page_size << mdts) {
+trace_nvme_dev_err_mdts(nvme_cid(req), n->page_size << mdts, len);
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+return NVME_SUCCESS;
+}
+
 static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
  uint64_t slba, uint32_t nlb,
  NvmeRequest *req)
@@ -593,6 +607,12 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 
 trace_nvme_dev_rw(is_write ? "write" : "read", nlb, data_size, slba);
 
+status = nvme_check_mdts(n, data_size, req);
+if (status) {
+block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+return status;
+}
+
 status = nvme_check_bounds(n, ns, slba, nlb, req);
 if (status) {
 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
@@ -884,6 +904,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 uint32_t numdl, numdu;
 uint64_t off, lpol, lpou;
 size_t   len;
+uint16_t status;
 
 numdl = (dw10 >> 16);
 numdu = (dw11 & 0xffff);
@@ -899,6 +920,11 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 
 trace_nvme_dev_get_log(nvme_cid(req), lid, lsp, rae, len, off);
 
+status = nvme_check_mdts(n, len, req);
+if (status) {
+return status;
+}
+
 switch (lid) {
 case NVME_LOG_ERROR_INFO:
 return nvme_error_info(n, cmd, rae, len, off, req);
@@ -2033,6 +2059,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 id->ieee[0] = 0x00;
 id->ieee[1] = 0x02;
 id->ieee[2] = 0xb3;
+id->mdts = params->mdts;
 id->ver = cpu_to_le32(NVME_SPEC_VER);
 id->oacs = cpu_to_le16(0);
 
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index a946ae88d817..a25568723d0d 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -9,7 +9,8 @@
 DEFINE_PROP_UINT32("num_queues", _state, _props.num_queues, 0), \
 DEFINE_PROP_UINT32("max_ioqpairs", _state, _props.max_ioqpairs, 64), \
 DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3), \
-DEFINE_PROP_UINT32("aer_max_queued", _state, _props.aer_max_queued, 64)
+DEFINE_PROP_UINT32("aer_max_queued", _state, _props.aer_max_queued, 64), \
+DEFINE_PROP_UINT8("mdts", _state, _props.mdts, 7)
 
 typedef struct NvmeParams {
 char *serial;
@@ -18,6 +19,7 @@ typedef struct NvmeParams {
 uint32_t cmb_size_mb;
 uint8_t  aerl;
 uint32_t aer_max_queued;
+uint8_t  mdts;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
diff --git a/hw/block/trace-events b/hw/block/trace-events
index e050af87ece4..291422a5b77d 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -83,6 +83,7 @@ nvme_dev_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) 
"cqid %"PRIu16" new_
 nvme_dev_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "cqid %"PRIu16" 
new_tail %"PRIu16""
 
 # nvme traces for error conditions
+nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts 
%"PRIu64" len %"PRIu64""
 nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
 nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or 
not page aligned: 0x%"PRIx64""
 nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 
0x%"PRIx64""
-- 
2.26.0




[PATCH v7 19/48] nvme: support identify namespace descriptor list

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Since we are not providing the NGUID or EUI64 fields, we must support
the Namespace UUID. We do not have any way of storing a persistent
unique identifier, so conjure up a UUID that is just the namespace id.
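
A small standalone sketch of the descriptor that ends up in the returned
list (struct and helper names here are illustrative): a 4-byte header with
NIDT=0x3 (UUID) and NIDL=16, followed by a 16-byte value that is zero
apart from the namespace id stored big-endian in its first four bytes.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    enum { NIDT_UUID = 0x3, NIDT_UUID_LEN = 16 };

    struct ns_id_descr {
        uint8_t nidt;
        uint8_t nidl;
        uint8_t rsvd[2];
        uint8_t v[16];  /* the "UUID": zero except for the nsid */
    };

    static void store_be32(uint8_t *p, uint32_t v)
    {
        p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
    }

    int main(void)
    {
        struct ns_id_descr d;
        uint32_t nsid = 1;

        memset(&d, 0, sizeof(d));
        d.nidt = NIDT_UUID;
        d.nidl = NIDT_UUID_LEN;
        store_be32(d.v, nsid);  /* mirrors stl_be_p(&...uuid.v, nsid) */

        printf("nidt=%d nidl=%d v[3]=%d\n", d.nidt, d.nidl, d.v[3]); /* 3 16 1 */
        return 0;
    }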

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 39 +++
 hw/block/trace-events |  1 +
 2 files changed, 40 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d4622278450e..f40bc861facc 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -949,6 +949,43 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeIdentify *c)
 return ret;
 }
 
+static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c)
+{
+uint32_t nsid = le32_to_cpu(c->nsid);
+uint64_t prp1 = le64_to_cpu(c->prp1);
+uint64_t prp2 = le64_to_cpu(c->prp2);
+
+uint8_t list[NVME_IDENTIFY_DATA_SIZE];
+
+struct data {
+struct {
+NvmeIdNsDescr hdr;
+uint8_t v[16];
+} uuid;
+};
+
+struct data *ns_descrs = (struct data *)list;
+
+trace_nvme_dev_identify_ns_descr_list(nsid);
+
+if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
+trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
+return NVME_INVALID_NSID | NVME_DNR;
+}
+
+/*
+ * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
+ * structure, a Namespace UUID (nidt = 0x3) must be reported in the
+ * Namespace Identification Descriptor. Add a very basic Namespace UUID
+ * here.
+ */
+ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
+ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
+stl_be_p(&ns_descrs->uuid.v, nsid);
+
+return nvme_dma_read_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2);
+}
+
 static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
 {
 NvmeIdentify *c = (NvmeIdentify *)cmd;
@@ -960,6 +997,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
 return nvme_identify_ctrl(n, c);
 case NVME_ID_CNS_NS_ACTIVE_LIST:
 return nvme_identify_nslist(n, c);
+case NVME_ID_CNS_NS_DESCR_LIST:
+return nvme_identify_ns_descr_list(n, c);
 default:
 trace_nvme_dev_err_invalid_identify_cns(le32_to_cpu(c->cns));
 return NVME_INVALID_FIELD | NVME_DNR;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index fb5b26f6f5f6..7ecd47131ac2 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -45,6 +45,7 @@ nvme_dev_del_cq(uint16_t cqid) "deleted completion queue, 
cqid=%"PRIu16""
 nvme_dev_identify_ctrl(void) "identify controller"
 nvme_dev_identify_ns(uint32_t ns) "nsid %"PRIu32""
 nvme_dev_identify_nslist(uint32_t ns) "nsid %"PRIu32""
+nvme_dev_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
 nvme_dev_getfeat(uint16_t cid, uint32_t fid) "cid %"PRIu16" fid 0x%"PRIx32""
 nvme_dev_setfeat(uint16_t cid, uint32_t fid, uint32_t val) "cid %"PRIu16" fid 
0x%"PRIx32" val 0x%"PRIx32""
 nvme_dev_getfeat_vwcache(const char* result) "get feature volatile write 
cache, result=%s"
-- 
2.26.0




[PATCH v7 13/48] nvme: add support for the get log page command

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Add support for the Get Log Page command and basic implementations of
the mandatory Error Information, SMART / Health Information and Firmware
Slot Information log pages.

In violation of the specification, the SMART / Health Information log
page does not persist information over the lifetime of the controller
because the device has no place to store such persistent state.

Note that the LPA field in the Identify Controller data structure
intentionally has bit 0 cleared because there is no namespace specific
information in the SMART / Health information log page.

Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
Section 5.10 ("Get Log Page command").
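
The transfer length and offset are assembled from four command dwords
(NUMD is a 0's based dword count split across NUMDL/NUMDU, the offset
across LPOL/LPOU); a small standalone sketch of that arithmetic with
made-up command values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t dw10 = 0x003f0002;  /* NUMDL in bits 31:16, LID = 0x02 */
        uint32_t dw11 = 0x00000000;  /* NUMDU in bits 15:0 */
        uint32_t dw12 = 0x00000200;  /* LPOL */
        uint32_t dw13 = 0x00000000;  /* LPOU */

        uint32_t numdl = dw10 >> 16;
        uint32_t numdu = dw11 & 0xffff;
        uint64_t off = ((uint64_t)dw13 << 32) | dw12;
        uint32_t len = (((numdu << 16) | numdl) + 1) << 2;  /* dwords -> bytes */

        printf("len=%u bytes, off=%u\n", (unsigned)len, (unsigned)off);
        /* len=256 bytes, off=512 */
        return 0;
    }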

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 hw/block/nvme.c   | 138 +-
 hw/block/nvme.h   |  11 
 hw/block/trace-events |   2 +
 3 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e777cc9075c1..76acc112fa7e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -569,6 +569,138 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
+uint64_t off, NvmeRequest *req)
+{
+uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
+uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
+uint32_t nsid = le32_to_cpu(cmd->nsid);
+
+uint32_t trans_len;
+time_t current_ms;
+uint64_t units_read = 0, units_written = 0;
+uint64_t read_commands = 0, write_commands = 0;
+NvmeSmartLog smart;
+BlockAcctStats *s;
+
+if (nsid && nsid != 0xffffffff) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+s = blk_get_stats(n->conf.blk);
+
+units_read = s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
+units_written = s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
+read_commands = s->nr_ops[BLOCK_ACCT_READ];
+write_commands = s->nr_ops[BLOCK_ACCT_WRITE];
+
+if (off > sizeof(smart)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+trans_len = MIN(sizeof(smart) - off, buf_len);
+
+memset(&smart, 0x0, sizeof(smart));
+
+smart.data_units_read[0] = cpu_to_le64(units_read / 1000);
+smart.data_units_written[0] = cpu_to_le64(units_written / 1000);
+smart.host_read_commands[0] = cpu_to_le64(read_commands);
+smart.host_write_commands[0] = cpu_to_le64(write_commands);
+
+smart.temperature[0] = n->temperature & 0xff;
+smart.temperature[1] = (n->temperature >> 8) & 0xff;
+
+if ((n->temperature > n->features.temp_thresh_hi) ||
+(n->temperature < n->features.temp_thresh_low)) {
+smart.critical_warning |= NVME_SMART_TEMPERATURE;
+}
+
+current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+smart.power_on_hours[0] =
+cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
+
+return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
+ prp2);
+}
+
+static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
+ uint64_t off, NvmeRequest *req)
+{
+uint32_t trans_len;
+uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
+uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
+NvmeFwSlotInfoLog fw_log;
+
+if (off > sizeof(fw_log)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+memset(&fw_log, 0, sizeof(NvmeFwSlotInfoLog));
+
+trans_len = MIN(sizeof(fw_log) - off, buf_len);
+
+return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1,
+ prp2);
+}
+
+static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
+uint64_t off, NvmeRequest *req)
+{
+uint32_t trans_len;
+uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
+uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
+NvmeErrorLog errlog;
+
+if (off > sizeof(errlog)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+memset(&errlog, 0x0, sizeof(errlog));
+
+trans_len = MIN(sizeof(errlog) - off, buf_len);
+
+return nvme_dma_read_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2);
+}
+
+static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+uint32_t dw10 = le32_to_cpu(cmd->cdw10);
+uint32_t dw11 = le32_to_cpu(cmd->cdw11);
+uint32_t dw12 = le32_to_cpu(cmd->cdw12);
+uint32_t dw13 = le32_to_cpu(cmd->cdw13);
+uint8_t  lid = dw10 & 0xff;
+uint8_t  rae = (dw10 >> 15) & 0x1;
+uint32_t numdl, numdu;
+uint64_t off, lpol, lpou;
+size_t   len;
+
+numdl = (dw10 >> 16);
+numdu = (dw11 & 0xffff);
+lpol = dw12;
+lpou = dw13;
+
+len = (((numdu << 16) | numdl) + 1) << 2;
+off = (lpou << 32ULL) | lpol;
+
+if (off & 0x3) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+trace_nvm

[PATCH v7 21/48] nvme: provide the mandatory subnqn field

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d88e21a14b77..d5c293476411 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1940,6 +1940,9 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 id->nn = cpu_to_le32(n->num_namespaces);
 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
 
+pstrcpy((char *) id->subnqn, sizeof(id->subnqn), "nqn.2019-08.org.qemu:");
+pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
+
 id->psd[0].mp = cpu_to_le16(0x9c4);
 id->psd[0].enlat = cpu_to_le32(0x10);
 id->psd[0].exlat = cpu_to_le32(0x4);
-- 
2.26.0




[PATCH v7 20/48] nvme: enforce valid queue creation sequence

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Support returning Command Sequence Error if Set Features on Number of
Queues is called after queues have been created.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 12 
 hw/block/nvme.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f40bc861facc..d88e21a14b77 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -888,6 +888,13 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
 cq = g_malloc0(sizeof(*cq));
 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
 NVME_CQ_FLAGS_IEN(qflags));
+
+/*
+ * It is only required to set qs_created when creating a completion queue;
+ * creating a submission queue without a matching completion queue will
+ * fail.
+ */
+n->qs_created = true;
 return NVME_SUCCESS;
 }
 
@@ -1202,6 +1209,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
 break;
 case NVME_NUMBER_OF_QUEUES:
+if (n->qs_created) {
+return NVME_CMD_SEQ_ERROR | NVME_DNR;
+}
+
 /*
  * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for NCQR
  * and NSQR.
@@ -1343,6 +1354,7 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
 
 n->aer_queued = 0;
 n->outstanding_aers = 0;
+n->qs_created = false;
 
 blk_flush(n->conf.blk);
 n->bar.cc = 0;
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 321d37aeaca4..f72ffddae160 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -99,6 +99,7 @@ typedef struct NvmeCtrl {
 BlockConfconf;
 NvmeParams   params;
 
+boolqs_created;
 uint32_tpage_size;
 uint16_tpage_bits;
 uint16_tmax_prp_ents;
-- 
2.26.0




[PATCH v7 18/48] nvme: add log specific field to trace events

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

The LSP field is not used directly now, but include it in the trace.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 3 ++-
 hw/block/trace-events | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c1e3ae81666a..d4622278450e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -767,6 +767,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
 uint8_t  lid = dw10 & 0xff;
+uint8_t  lsp = (dw10 >> 8) & 0xf;
 uint8_t  rae = (dw10 >> 15) & 0x1;
 uint32_t numdl, numdu;
 uint64_t off, lpol, lpou;
@@ -784,7 +785,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-trace_nvme_dev_get_log(nvme_cid(req), lid, rae, len, off);
+trace_nvme_dev_get_log(nvme_cid(req), lid, lsp, rae, len, off);
 
 switch (lid) {
 case NVME_LOG_ERROR_INFO:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 659091fc2fed..fb5b26f6f5f6 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -52,7 +52,7 @@ nvme_dev_getfeat_numq(int result) "get feature number of 
queues, result=%d"
 nvme_dev_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested 
cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
 nvme_dev_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64""
 nvme_dev_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64""
-nvme_dev_get_log(uint16_t cid, uint8_t lid, uint8_t rae, uint32_t len, 
uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off 
%"PRIu64""
+nvme_dev_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t 
len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" 
len %"PRIu32" off %"PRIu64""
 nvme_dev_process_aers(int queued) "queued %d"
 nvme_dev_aer(uint16_t cid) "cid %"PRIu16""
 nvme_dev_aer_aerl_exceeded(void) "aerl exceeded"
-- 
2.26.0




[PATCH v7 15/48] nvme: add missing mandatory features

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Add support for returning a reasonable response to Get/Set Features of
mandatory features.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 hw/block/nvme.c   | 60 ++-
 hw/block/trace-events |  2 ++
 include/block/nvme.h  |  6 -
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b45840ddf8b3..e448a1c85cb9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1025,7 +1025,15 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
 uint32_t result;
 
+trace_nvme_dev_getfeat(nvme_cid(req), dw10);
+
 switch (dw10) {
+case NVME_ARBITRATION:
+result = cpu_to_le32(n->features.arbitration);
+break;
+case NVME_POWER_MANAGEMENT:
+result = cpu_to_le32(n->features.power_mgmt);
+break;
 case NVME_TEMPERATURE_THRESHOLD:
 result = 0;
 
@@ -1046,9 +1054,12 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 break;
 }
 
+break;
+case NVME_ERROR_RECOVERY:
+result = cpu_to_le32(n->features.err_rec);
 break;
 case NVME_VOLATILE_WRITE_CACHE:
-result = blk_enable_write_cache(n->conf.blk);
+result = cpu_to_le32(blk_enable_write_cache(n->conf.blk));
 trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
 break;
 case NVME_NUMBER_OF_QUEUES:
@@ -1058,6 +1069,19 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 break;
 case NVME_TIMESTAMP:
 return nvme_get_feature_timestamp(n, cmd);
+case NVME_INTERRUPT_COALESCING:
+result = cpu_to_le32(n->features.int_coalescing);
+break;
+case NVME_INTERRUPT_VECTOR_CONF:
+if ((dw11 & 0xffff) >= n->params.max_ioqpairs + 1) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+result = cpu_to_le32(n->features.int_vector_config[dw11 & 0xffff]);
+break;
+case NVME_WRITE_ATOMICITY:
+result = cpu_to_le32(n->features.write_atomicity);
+break;
 case NVME_ASYNCHRONOUS_EVENT_CONF:
 result = cpu_to_le32(n->features.async_config);
 break;
@@ -1093,6 +1117,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
 
+trace_nvme_dev_setfeat(nvme_cid(req), dw10, dw11);
+
 switch (dw10) {
 case NVME_TEMPERATURE_THRESHOLD:
 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
@@ -1120,6 +1146,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 
 break;
 case NVME_VOLATILE_WRITE_CACHE:
+if (blk_enable_write_cache(n->conf.blk)) {
+blk_flush(n->conf.blk);
+}
+
 blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
 break;
 case NVME_NUMBER_OF_QUEUES:
@@ -1135,6 +1165,13 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 case NVME_ASYNCHRONOUS_EVENT_CONF:
 n->features.async_config = dw11;
 break;
+case NVME_ARBITRATION:
+case NVME_POWER_MANAGEMENT:
+case NVME_ERROR_RECOVERY:
+case NVME_INTERRUPT_COALESCING:
+case NVME_INTERRUPT_VECTOR_CONF:
+case NVME_WRITE_ATOMICITY:
+return NVME_FEAT_NOT_CHANGABLE | NVME_DNR;
 default:
 trace_nvme_dev_err_invalid_setfeat(dw10);
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -1715,6 +1752,25 @@ static void nvme_init_state(NvmeCtrl *n)
 n->temperature = NVME_TEMPERATURE;
 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+
+/*
+ * There is no limit on the number of commands that the controller may
+ * launch at one time from a particular Submission Queue.
+ */
+n->features.arbitration = NVME_ARB_AB_NOLIMIT;
+
+n->features.int_vector_config = g_malloc0_n(n->params.max_ioqpairs + 1,
+sizeof(*n->features.int_vector_config));
+
+for (int i = 0; i < n->params.max_ioqpairs + 1; i++) {
+n->features.int_vector_config[i] = i;
+
+/* interrupt coalescing is not supported for the admin queue */
+if (i == 0) {
+n->features.int_vector_config[i] |= NVME_INTVC_NOCOALESCING;
+}
+}
+
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 }
 
@@ -1803,6 +1859,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 id->cqes = (0x4 << 4) | 0x4;
 id->nn = cpu_to_le32(n->num_namespaces);
 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
+
 id->psd[0].mp = cpu_to_le16(0x9c4);
 id->psd[0].enlat = cpu_to_le32(0x10);
 id->psd[0].exlat = cpu_to_le32(0x4);
@@ -1878,6 +1935,7 @@ static void nvme_exit(PCIDevice *pci_dev)

[PATCH v7 14/48] nvme: add support for the asynchronous event request command

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
Section 5.2 ("Asynchronous Event Request command").

Mostly imported from Keith's qemu-nvme tree. Modified with a max number
of queued events (controllable with the aer_max_queued device
parameter). The spec states that the controller *should* retain
events, so we do best effort here.
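
A usage sketch (drive, serial, and the chosen limit are placeholders; the
parameter name is the one introduced by this patch):

    -drive file=nvme.img,if=none,id=drv0 \
    -device nvme,drive=drv0,serial=deadbeef,aer_max_queued=64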

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 178 --
 hw/block/nvme.h   |  14 +++-
 hw/block/trace-events |   9 +++
 include/block/nvme.h  |   8 +-
 4 files changed, 199 insertions(+), 10 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 76acc112fa7e..b45840ddf8b3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -325,6 +325,85 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, 
NvmeRequest *req)
 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
 }
 
+static void nvme_process_aers(void *opaque)
+{
+NvmeCtrl *n = opaque;
+NvmeAsyncEvent *event, *next;
+
+trace_nvme_dev_process_aers(n->aer_queued);
+
+QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
+NvmeRequest *req;
+NvmeAerResult *result;
+
+/* can't post cqe if there is nothing to complete */
+if (!n->outstanding_aers) {
+trace_nvme_dev_no_outstanding_aers();
+break;
+}
+
+/* ignore if masked (cqe posted, but event not cleared) */
+if (n->aer_mask & (1 << event->result.event_type)) {
+trace_nvme_dev_aer_masked(event->result.event_type, n->aer_mask);
+continue;
+}
+
+QTAILQ_REMOVE(&n->aer_queue, event, entry);
+n->aer_queued--;
+
+n->aer_mask |= 1 << event->result.event_type;
+n->outstanding_aers--;
+
+req = n->aer_reqs[n->outstanding_aers];
+
+result = (NvmeAerResult *) &req->cqe.result;
+result->event_type = event->result.event_type;
+result->event_info = event->result.event_info;
+result->log_page = event->result.log_page;
+g_free(event);
+
+req->status = NVME_SUCCESS;
+
+trace_nvme_dev_aer_post_cqe(result->event_type, result->event_info,
+result->log_page);
+
+nvme_enqueue_req_completion(&n->admin_cq, req);
+}
+}
+
+static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
+   uint8_t event_info, uint8_t log_page)
+{
+NvmeAsyncEvent *event;
+
+trace_nvme_dev_enqueue_event(event_type, event_info, log_page);
+
+if (n->aer_queued == n->params.aer_max_queued) {
+trace_nvme_dev_enqueue_event_noqueue(n->aer_queued);
+return;
+}
+
+event = g_new(NvmeAsyncEvent, 1);
+event->result = (NvmeAerResult) {
+.event_type = event_type,
+.event_info = event_info,
+.log_page   = log_page,
+};
+
+QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
+n->aer_queued++;
+
+nvme_process_aers(n);
+}
+
+static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
+{
+n->aer_mask &= ~(1 << event_type);
+if (!QTAILQ_EMPTY(&n->aer_queue)) {
+nvme_process_aers(n);
+}
+}
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
 NvmeRequest *req = opaque;
@@ -569,8 +648,9 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
 return NVME_SUCCESS;
 }
 
-static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
-uint64_t off, NvmeRequest *req)
+static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
+uint32_t buf_len, uint64_t off,
+NvmeRequest *req)
 {
 uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
 uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
@@ -619,6 +699,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, 
uint32_t buf_len,
 smart.power_on_hours[0] =
 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
 
+if (!rae) {
+nvme_clear_events(n, NVME_AER_TYPE_SMART);
+}
+
 return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
  prp2);
 }
@@ -643,14 +727,19 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd 
*cmd, uint32_t buf_len,
  prp2);
 }
 
-static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
-uint64_t off, NvmeRequest *req)
+static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
+uint32_t buf_len, uint64_t off,
+NvmeRequest *req)
 {
 uint32_t trans_len;
 uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
 uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
 NvmeErrorLog errlog;
 
+if (!rae) {
+

[PATCH v7 12/48] nvme: add temperature threshold feature

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

It might seem weird to implement this feature for an emulated device,
but it is mandatory to support and the feature is useful for testing
asynchronous event request support, which will be added in a later
patch.
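
As a rough host-side sketch of the Set Features payload handled below (field
layout per NVMe 1.3, Temperature Threshold: TMPTH in bits 15:0, TMPSEL in
bits 19:16, THSEL in bits 21:20; the values are hypothetical):

    uint32_t tmpth  = 0x16d;  /* threshold in Kelvin, ~92 degrees C */
    uint32_t tmpsel = 0x0;    /* Composite Temperature sensor */
    uint32_t thsel  = 0x0;    /* over temperature threshold */
    uint32_t dw11   = (thsel << 20) | (tmpsel << 16) | tmpth;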

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c  | 48 
 hw/block/nvme.h  |  1 +
 include/block/nvme.h |  8 +++-
 3 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d1c42ee4765c..e777cc9075c1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -45,6 +45,9 @@
 #include "nvme.h"
 
 #define NVME_CMB_BIR 2
+#define NVME_TEMPERATURE 0x143
+#define NVME_TEMPERATURE_WARNING 0x157
+#define NVME_TEMPERATURE_CRITICAL 0x175
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
@@ -798,9 +801,31 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, 
NvmeCmd *cmd)
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
+uint32_t dw11 = le32_to_cpu(cmd->cdw11);
 uint32_t result;
 
 switch (dw10) {
+case NVME_TEMPERATURE_THRESHOLD:
+result = 0;
+
+/*
+ * The controller only implements the Composite Temperature sensor, so
+ * return 0 for all other sensors.
+ */
+if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
+break;
+}
+
+switch (NVME_TEMP_THSEL(dw11)) {
+case NVME_TEMP_THSEL_OVER:
+result = cpu_to_le16(n->features.temp_thresh_hi);
+break;
+case NVME_TEMP_THSEL_UNDER:
+result = cpu_to_le16(n->features.temp_thresh_low);
+break;
+}
+
+break;
 case NVME_VOLATILE_WRITE_CACHE:
 result = blk_enable_write_cache(n->conf.blk);
 trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
@@ -845,6 +870,23 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
 
 switch (dw10) {
+case NVME_TEMPERATURE_THRESHOLD:
+if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
+break;
+}
+
+switch (NVME_TEMP_THSEL(dw11)) {
+case NVME_TEMP_THSEL_OVER:
+n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
+break;
+case NVME_TEMP_THSEL_UNDER:
+n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
+break;
+default:
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+break;
 case NVME_VOLATILE_WRITE_CACHE:
 blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
 break;
@@ -1373,6 +1415,7 @@ static void nvme_init_state(NvmeCtrl *n)
 n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
+n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
 }
 
 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -1450,6 +1493,11 @@ static void nvme_init_ctrl(NvmeCtrl *n)
 id->acl = 3;
 id->frmw = 7 << 1;
 id->lpa = 1 << 0;
+
+/* recommended default value (~70 C) */
+id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
+id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
+
 id->sqes = (0x6 << 4) | 0x6;
 id->cqes = (0x4 << 4) | 0x4;
 id->nn = cpu_to_le32(n->num_namespaces);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b7c465560eea..807c4ad19dcc 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -115,6 +115,7 @@ typedef struct NvmeCtrl {
 NvmeSQueue  admin_sq;
 NvmeCQueue  admin_cq;
 NvmeIdCtrl  id_ctrl;
+NvmeFeatureVal  features;
 } NvmeCtrl;
 
 static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index b30744068d46..a0519814ecec 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -688,7 +688,13 @@ enum NvmeIdCtrlOncs {
 typedef struct NvmeFeatureVal {
 uint32_t    arbitration;
 uint32_t    power_mgmt;
-uint32_t    temp_thresh;
+union {
+struct {
+uint16_t temp_thresh_hi;
+uint16_t temp_thresh_low;
+};
+uint32_t temp_thresh;
+};
 uint32_t    err_rec;
 uint32_t    volatile_wc;
 uint32_t    num_queues;
-- 
2.26.0




[PATCH v7 09/48] nvme: add max_ioqpairs device parameter

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

The num_queues device parameter has a slightly confusing meaning because
it accounts for the admin queue pair, which is not really optional.
Secondly, it is really a maximum number of allowed queues.

Add a new max_ioqpairs parameter that only accounts for I/O queue pairs,
but keep num_queues for compatibility.
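
A usage sketch with the new parameter (drive and serial values are
placeholders):

    -drive file=nvme.img,if=none,id=drv0 \
    -device nvme,drive=drv0,serial=deadbeef,max_ioqpairs=8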

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 45 ++---
 hw/block/nvme.h |  4 +++-
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 03278726422d..f45909dad480 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,7 +19,7 @@
  *  -drive file=,if=none,id=
  *  -device nvme,drive=,serial=,id=, \
  *  cmb_size_mb=, \
- *  num_queues=
+ *  max_ioqpairs=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -27,6 +27,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/units.h"
+#include "qemu/error-report.h"
 #include "hw/block/block.h"
 #include "hw/pci/msix.h"
 #include "hw/pci/pci.h"
@@ -72,12 +73,12 @@ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
*buf, int size)
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
 {
-return sqid < n->params.num_queues && n->sq[sqid] != NULL ? 0 : -1;
+return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
 }
 
 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
 {
-return cqid < n->params.num_queues && n->cq[cqid] != NULL ? 0 : -1;
+return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
 }
 
 static void nvme_inc_cq_tail(NvmeCQueue *cq)
@@ -639,7 +640,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
 trace_nvme_dev_err_invalid_create_cq_addr(prp1);
 return NVME_INVALID_FIELD | NVME_DNR;
 }
-if (unlikely(vector > n->params.num_queues)) {
+if (unlikely(vector > n->params.max_ioqpairs + 1)) {
 trace_nvme_dev_err_invalid_create_cq_vector(vector);
 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
 }
@@ -803,8 +804,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
 break;
 case NVME_NUMBER_OF_QUEUES:
-result = cpu_to_le32((n->params.num_queues - 2) |
- ((n->params.num_queues - 2) << 16));
+result = cpu_to_le32((n->params.max_ioqpairs - 1) |
+ ((n->params.max_ioqpairs - 1) << 16));
 trace_nvme_dev_getfeat_numq(result);
 break;
 case NVME_TIMESTAMP:
@@ -848,10 +849,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 case NVME_NUMBER_OF_QUEUES:
 trace_nvme_dev_setfeat_numq((dw11 & 0xffff) + 1,
 ((dw11 >> 16) & 0xffff) + 1,
-n->params.num_queues - 1,
-n->params.num_queues - 1);
-req->cqe.result = cpu_to_le32((n->params.num_queues - 2) |
-  ((n->params.num_queues - 2) << 16));
+n->params.max_ioqpairs,
+n->params.max_ioqpairs);
+req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
+  ((n->params.max_ioqpairs - 1) << 16));
 break;
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, cmd);
@@ -924,12 +925,12 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
 
 blk_drain(n->conf.blk);
 
-for (i = 0; i < n->params.num_queues; i++) {
+for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
 if (n->sq[i] != NULL) {
 nvme_free_sq(n->sq[i], n);
 }
 }
-for (i = 0; i < n->params.num_queues; i++) {
+for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
 if (n->cq[i] != NULL) {
 nvme_free_cq(n->cq[i], n);
 }
@@ -1330,9 +1331,15 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 int64_t bs_size;
 uint8_t *pci_conf;
 
-if (!n->params.num_queues) {
-error_setg(errp, "num_queues can't be zero");
-return;
+if (n->params.num_queues) {
+warn_report("nvme: num_queues is deprecated; please use max_ioqpairs "
+"instead");
+
+n->params.max_ioqpairs = n->params.num_queues - 1;
+}
+
+if (!n->params.max_ioqpairs) {
+error_setg(errp, "max_ioqpairs can't be less than 1");
 }
 
 if (!n->conf.blk) {
@@ -1363,19 +1370,19 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 pcie_endpoint_cap_init(pci_dev, 0x80);
 
 n->num_namespaces = 1;
-n->reg_size = pow2ceil(0x1000 + 2 * n->params.num_queues * 4);
+n->reg_size = pow2ceil(0x1008 + 2 * (n->params.max_ioqpairs) * 4);

[PATCH v7 11/48] nvme: refactor device realization

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

This patch splits up nvme_realize into multiple individual functions,
each initializing a different subset of the device.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 hw/block/nvme.c | 178 +++-
 hw/block/nvme.h |  21 ++
 2 files changed, 136 insertions(+), 63 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 123539a5d0ae..d1c42ee4765c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -44,6 +44,8 @@
 #include "trace.h"
 #include "nvme.h"
 
+#define NVME_CMB_BIR 2
+
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
 (trace_##trace)(__VA_ARGS__); \
@@ -1322,73 +1324,112 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
-static void nvme_realize(PCIDevice *pci_dev, Error **errp)
+static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
 {
-NvmeCtrl *n = NVME(pci_dev);
-NvmeIdCtrl *id = &n->id_ctrl;
+NvmeParams *params = &n->params;
 
-int i;
-int64_t bs_size;
-uint8_t *pci_conf;
-
-if (n->params.num_queues) {
+if (params->num_queues) {
 warn_report("nvme: num_queues is deprecated; please use max_ioqpairs "
 "instead");
 
-n->params.max_ioqpairs = n->params.num_queues - 1;
+params->max_ioqpairs = params->num_queues - 1;
 }
 
-if (!n->params.max_ioqpairs) {
-error_setg(errp, "max_ioqpairs can't be less than 1");
+if (params->max_ioqpairs < 1 ||
+params->max_ioqpairs > PCI_MSIX_FLAGS_QSIZE) {
+error_setg(errp, "nvme: max_ioqpairs must be between 1 and %d",
+   PCI_MSIX_FLAGS_QSIZE);
+return -1;
 }
 
 if (!n->conf.blk) {
-error_setg(errp, "drive property not set");
-return;
+error_setg(errp, "nvme: block backend not configured");
+return -1;
 }
 
-bs_size = blk_getlength(n->conf.blk);
-if (bs_size < 0) {
-error_setg(errp, "could not get backing file size");
-return;
+if (!params->serial) {
+error_setg(errp, "nvme: serial not configured");
+return -1;
 }
 
-if (!n->params.serial) {
-error_setg(errp, "serial property not set");
-return;
-}
+return 0;
+}
+
+static int nvme_init_blk(NvmeCtrl *n, Error **errp)
+{
 blkconf_blocksizes(&n->conf);
 if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
false, errp)) {
-return;
+return -1;
 }
 
-pci_conf = pci_dev->config;
-pci_conf[PCI_INTERRUPT_PIN] = 1;
-pci_config_set_prog_interface(pci_dev->config, 0x2);
-pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
-pcie_endpoint_cap_init(pci_dev, 0x80);
+return 0;
+}
 
+static void nvme_init_state(NvmeCtrl *n)
+{
 n->num_namespaces = 1;
 n->reg_size = pow2ceil(0x1008 + 2 * (n->params.max_ioqpairs) * 4);
-n->ns_size = bs_size / (uint64_t)n->num_namespaces;
-
 n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
+}
 
-memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
-  "nvme", n->reg_size);
-pci_register_bar(pci_dev, 0,
-PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
-&n->iomem);
+static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
+{
+NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
+NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
+
+NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
+NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
+NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);
+NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
+
+n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
+  "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_64 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
+}
+
+static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev)
+{
+uint8_t *pci_conf = pci_dev->config;
+
+pci_conf[PCI_INTERRUPT_PIN] = 1;
+pci_config_set_prog_interface(pci_conf, 0x2);
+pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
+pci_config_set_device_id(pci_conf, 0x5845);
+pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
+pcie_endpoint_cap_init(pci_dev, 0x80);
+
+memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
+  n->reg_size);
+pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEM

[PATCH v7 17/48] nvme: make sure ncqr and nsqr is valid

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

0xffff is not an allowed value for NCQR and NSQR in Set Features on
Number of Queues.
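
For reference, a sketch of a valid request (per NVMe, NSQR sits in dw11 bits
15:0 and NCQR in bits 31:16, both 0's based; the counts are hypothetical):

    /* request 4 I/O submission queues and 4 I/O completion queues */
    uint32_t dw11 = (3 << 16) | 3;    /* 0xffff in either field is rejected */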

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7094767eeccb..c1e3ae81666a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1162,6 +1162,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
 break;
 case NVME_NUMBER_OF_QUEUES:
+/*
+ * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for NCQR
+ * and NSQR.
+ */
+if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 trace_nvme_dev_setfeat_numq((dw11 & 0x) + 1,
 ((dw11 >> 16) & 0x) + 1,
 n->params.max_ioqpairs,
-- 
2.26.0




[PATCH v7 07/48] nvme: add support for the abort command

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
Section 5.1 ("Abort command").

The Abort command is a best effort command; for now, the device always
fails to abort the given command.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 02d3dde90842..bea37c73732a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -729,6 +729,18 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
 }
 }
 
+static uint16_t nvme_abort(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+uint16_t sqid = le32_to_cpu(cmd->cdw10) & 0xffff;
+
+req->cqe.result = 1;
+if (nvme_check_sqid(n, sqid)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+return NVME_SUCCESS;
+}
+
 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
 {
 trace_nvme_dev_setfeat_timestamp(ts);
@@ -863,6 +875,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 return nvme_create_cq(n, cmd);
 case NVME_ADM_CMD_IDENTIFY:
 return nvme_identify(n, cmd);
+case NVME_ADM_CMD_ABORT:
+return nvme_abort(n, cmd, req);
 case NVME_ADM_CMD_SET_FEATURES:
 return nvme_set_feature(n, cmd, req);
 case NVME_ADM_CMD_GET_FEATURES:
@@ -1373,6 +1387,19 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 id->ieee[1] = 0x02;
 id->ieee[2] = 0xb3;
 id->oacs = cpu_to_le16(0);
+
+/*
+ * Because the controller always completes the Abort command immediately,
+ * there can never be more than one concurrently executing Abort command,
+ * so this value is never used for anything. Note that there can easily be
+ * many Abort commands in the queues, but they are not considered
+ * "executing" until processed by nvme_abort.
+ *
+ * The specification recommends a value of 3 for Abort Command Limit (four
+ * concurrently outstanding Abort commands), so let's use that, though it is
+ * inconsequential.
+ */
+id->acl = 3;
 id->frmw = 7 << 1;
 id->lpa = 1 << 0;
 id->sqes = (0x6 << 4) | 0x6;
-- 
2.26.0




[PATCH v7 08/48] nvme: fix pci doorbell size calculation

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

The size of the BAR is 0x1000 (main registers) + 8 bytes for each
queue. Currently, the size of the BAR is calculated like so:

n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);

Since the 'num_queues' parameter already accounts for the admin queue,
it does not need to be incremented by one. Also, the size calculation
should start from 0x1000, not 0x1004.

n->reg_size = pow2ceil(0x1000 + 2 * n->params.num_queues * 4);

Thus, with the default value of num_queues (64), we set aside room for
1 admin queue and 63 I/O queues (4 bytes per doorbell, 2 doorbells per
queue).
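
As a worked example with the default num_queues of 64 (1 admin + 63 I/O
queue pairs):

    doorbells: 2 * 64 * 4 = 512 bytes = 0x200
    reg_size:  pow2ceil(0x1000 + 0x200) = 0x2000 (8 KiB)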

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index bea37c73732a..03278726422d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1363,7 +1363,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 pcie_endpoint_cap_init(pci_dev, 0x80);
 
 n->num_namespaces = 1;
-n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
+n->reg_size = pow2ceil(0x1000 + 2 * n->params.num_queues * 4);
 n->ns_size = bs_size / (uint64_t)n->num_namespaces;
 
 n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
-- 
2.26.0




[PATCH v7 06/48] nvme: refactor nvme_addr_read

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Pull the controller memory buffer check to its own function. The check
will be used on its own in later patches.

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 hw/block/nvme.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 622103c42d0a..02d3dde90842 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -52,14 +52,22 @@
 
 static void nvme_process_sq(void *opaque);
 
+static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
+{
+hwaddr low = n->ctrl_mem.addr;
+hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+
+return addr >= low && addr < hi;
+}
+
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->cmbsz && addr >= n->ctrl_mem.addr &&
-addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
+if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
 memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
-} else {
-pci_dma_read(&n->parent_obj, addr, buf, size);
+return;
 }
+
+pci_dma_read(&n->parent_obj, addr, buf, size);
 }
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
-- 
2.26.0




[PATCH v7 01/48] nvme: rename trace events to nvme_dev

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Change the prefix of all nvme device related trace events to 'nvme_dev'
to not clash with trace events from the nvme block driver.

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 190 +-
 hw/block/trace-events | 172 +++---
 2 files changed, 180 insertions(+), 182 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf377..01e18fb9eb1f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -112,16 +112,16 @@ static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
 {
 if (cq->irq_enabled) {
 if (msix_enabled(&(n->parent_obj))) {
-trace_nvme_irq_msix(cq->vector);
+trace_nvme_dev_irq_msix(cq->vector);
 msix_notify(&(n->parent_obj), cq->vector);
 } else {
-trace_nvme_irq_pin();
+trace_nvme_dev_irq_pin();
 assert(cq->cqid < 64);
 n->irq_status |= 1 << cq->cqid;
 nvme_irq_check(n);
 }
 } else {
-trace_nvme_irq_masked();
+trace_nvme_dev_irq_masked();
 }
 }
 
@@ -146,7 +146,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
-trace_nvme_err_invalid_prp();
+trace_nvme_dev_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
 } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
@@ -160,7 +160,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 len -= trans_len;
 if (len) {
 if (unlikely(!prp2)) {
-trace_nvme_err_invalid_prp2_missing();
+trace_nvme_dev_err_invalid_prp2_missing();
 goto unmap;
 }
 if (len > n->page_size) {
@@ -176,7 +176,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 
 if (i == n->max_prp_ents - 1 && len > n->page_size) {
 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
-trace_nvme_err_invalid_prplist_ent(prp_ent);
+trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
 goto unmap;
 }
 
@@ -189,7 +189,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 }
 
 if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
-trace_nvme_err_invalid_prplist_ent(prp_ent);
+trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
 goto unmap;
 }
 
@@ -204,7 +204,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 }
 } else {
 if (unlikely(prp2 & (n->page_size - 1))) {
-trace_nvme_err_invalid_prp2_align(prp2);
+trace_nvme_dev_err_invalid_prp2_align(prp2);
 goto unmap;
 }
 if (qsg->nsg) {
@@ -252,20 +252,20 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t 
*ptr, uint32_t len,
 QEMUIOVector iov;
 uint16_t status = NVME_SUCCESS;
 
-trace_nvme_dma_read(prp1, prp2);
+trace_nvme_dev_dma_read(prp1, prp2);
 
 if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 if (qsg.nsg > 0) {
 if (unlikely(dma_buf_read(ptr, len, &qsg))) {
-trace_nvme_err_invalid_dma();
+trace_nvme_dev_err_invalid_dma();
 status = NVME_INVALID_FIELD | NVME_DNR;
 }
 qemu_sglist_destroy(&qsg);
 } else {
 if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
-trace_nvme_err_invalid_dma();
+trace_nvme_dev_err_invalid_dma();
 status = NVME_INVALID_FIELD | NVME_DNR;
 }
 qemu_iovec_destroy(&iov);
@@ -354,7 +354,7 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace 
*ns, NvmeCmd *cmd,
 uint32_t count = nlb << data_shift;
 
 if (unlikely(slba + nlb > ns->id_ns.nsze)) {
-trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
 return NVME_LBA_RANGE | NVME_DNR;
 }
 
@@ -382,11 +382,11 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
 enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 
-trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
+trace_nvme_dev_rw(is_write ? "write" : "read", nlb, data_size, slba);
 
 if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
 block_acct_invalid(blk_get_stats(n->conf.blk), acct);
-trac

[PATCH v7 05/48] nvme: use constants in identify

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 088668f28bae..622103c42d0a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -679,7 +679,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify 
*c)
 
 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
 {
-static const int data_len = 4 * KiB;
+static const int data_len = NVME_IDENTIFY_DATA_SIZE;
 uint32_t min_nsid = le32_to_cpu(c->nsid);
 uint64_t prp1 = le64_to_cpu(c->prp1);
 uint64_t prp2 = le64_to_cpu(c->prp2);
@@ -709,11 +709,11 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
 NvmeIdentify *c = (NvmeIdentify *)cmd;
 
 switch (le32_to_cpu(c->cns)) {
-case 0x00:
+case NVME_ID_CNS_NS:
 return nvme_identify_ns(n, c);
-case 0x01:
+case NVME_ID_CNS_CTRL:
 return nvme_identify_ctrl(n, c);
-case 0x02:
+case NVME_ID_CNS_NS_ACTIVE_LIST:
 return nvme_identify_nslist(n, c);
 default:
 trace_nvme_dev_err_invalid_identify_cns(le32_to_cpu(c->cns));
-- 
2.26.0




[PATCH v7 04/48] nvme: bump spec data structures to v1.3

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Add missing fields in the Identify Controller and Identify Namespace
data structures to bring them in line with NVMe v1.3.

This also adds data structures and defines for SGL support which
requires a couple of trivial changes to the nvme block driver as well.

Signed-off-by: Klaus Jensen 
Acked-by: Fam Zheng 
Reviewed-by: Maxim Levitsky 
---
 block/nvme.c |  18 ++---
 hw/block/nvme.c  |  12 ++--
 include/block/nvme.h | 162 ++-
 3 files changed, 160 insertions(+), 32 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 7b7c0cc5d673..7302cc19ade4 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -446,7 +446,7 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 error_setg(errp, "Cannot map buffer for DMA");
 goto out;
 }
-cmd.prp1 = cpu_to_le64(iova);
+cmd.dptr.prp1 = cpu_to_le64(iova);
 
 if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
 error_setg(errp, "Failed to identify controller");
@@ -545,7 +545,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 }
 cmd = (NvmeCmd) {
 .opcode = NVME_ADM_CMD_CREATE_CQ,
-.prp1 = cpu_to_le64(q->cq.iova),
+.dptr.prp1 = cpu_to_le64(q->cq.iova),
 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xffff)),
 .cdw11 = cpu_to_le32(0x3),
 };
@@ -556,7 +556,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 }
 cmd = (NvmeCmd) {
 .opcode = NVME_ADM_CMD_CREATE_SQ,
-.prp1 = cpu_to_le64(q->sq.iova),
+.dptr.prp1 = cpu_to_le64(q->sq.iova),
 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xffff)),
 .cdw11 = cpu_to_le32(0x1 | (n << 16)),
 };
@@ -906,16 +906,16 @@ try_map:
 case 0:
 abort();
 case 1:
-cmd->prp1 = pagelist[0];
-cmd->prp2 = 0;
+cmd->dptr.prp1 = pagelist[0];
+cmd->dptr.prp2 = 0;
 break;
 case 2:
-cmd->prp1 = pagelist[0];
-cmd->prp2 = pagelist[1];
+cmd->dptr.prp1 = pagelist[0];
+cmd->dptr.prp2 = pagelist[1];
 break;
 default:
-cmd->prp1 = pagelist[0];
-cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
+cmd->dptr.prp1 = pagelist[0];
+cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
 break;
 }
 trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 249f759f076e..088668f28bae 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -372,8 +372,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
NvmeCmd *cmd,
 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
 uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
-uint64_t prp1 = le64_to_cpu(rw->prp1);
-uint64_t prp2 = le64_to_cpu(rw->prp2);
+uint64_t prp1 = le64_to_cpu(rw->dptr.prp1);
+uint64_t prp2 = le64_to_cpu(rw->dptr.prp2);
 
 uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
 uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
@@ -763,8 +763,8 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
 
 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
 {
-uint64_t prp1 = le64_to_cpu(cmd->prp1);
-uint64_t prp2 = le64_to_cpu(cmd->prp2);
+uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
+uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
 
 uint64_t timestamp = nvme_get_timestamp(n);
 
@@ -802,8 +802,8 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, 
NvmeCmd *cmd)
 {
 uint16_t ret;
 uint64_t timestamp;
-uint64_t prp1 = le64_to_cpu(cmd->prp1);
-uint64_t prp2 = le64_to_cpu(cmd->prp2);
+uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
+uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
 
 ret = nvme_dma_write_prp(n, (uint8_t *)&timestamp,
 sizeof(timestamp), prp1, prp2);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8fb941c6537c..b30744068d46 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -205,15 +205,53 @@ enum NvmeCmbszMask {
 #define NVME_CMBSZ_GETSIZE(cmbsz) \
 (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
 
+enum NvmeSglDescriptorType {
+NVME_SGL_DESCR_TYPE_DATA_BLOCK  = 0x0,
+NVME_SGL_DESCR_TYPE_BIT_BUCKET  = 0x1,
+NVME_SGL_DESCR_TYPE_SEGMENT = 0x2,
+NVME_SGL_DESCR_TYPE_LAST_SEGMENT= 0x3,
+NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK= 0x4,
+
+NVME_SGL_DESCR_TYPE_VENDOR_SPECIFIC = 0xf,
+};
+
+enum NvmeSglDescriptorSubtype {
+NVME_SGL_DESCR_SUBTYPE_ADDRESS = 0x0,
+};
+
+typedef struct NvmeSglDescriptor {
+uint64_t addr;
+uint32_t len;
+uint8_t  rsvd[3];
+uint8_t  type;
+} NvmeSglDescriptor;
+
+#define NVME_SGL_TYPE(type) ((type >> 4) & 0xf)
+#define NVME_SGL_SUBTYPE(type)  (type & 0xf)
+
+typedef union 

[PATCH v7 10/48] nvme: remove redundant cmbloc/cmbsz members

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 7 ++-
 hw/block/nvme.h | 2 --
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f45909dad480..123539a5d0ae 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -63,7 +63,7 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
+if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
 memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
 return;
 }
@@ -157,7 +157,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (unlikely(!prp1)) {
 trace_nvme_dev_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
+} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
@@ -1443,9 +1443,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
 
-n->cmbloc = n->bar.cmbloc;
-n->cmbsz = n->bar.cmbsz;
-
 n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
 memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
   "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 98f5b9479244..ad1786953be9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -88,8 +88,6 @@ typedef struct NvmeCtrl {
 uint32_t    num_namespaces;
 uint32_t    max_q_ents;
 uint64_t    ns_size;
-uint32_t    cmbsz;
-uint32_t    cmbloc;
 uint8_t     *cmbuf;
 uint64_t    irq_status;
 uint64_t    host_timestamp; /* Timestamp sent by the host 
*/
-- 
2.26.0




[PATCH v7 00/48] nvme: support NVMe v1.3d, SGLs and multiple namespaces

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Hi,

v7 is mostly just changes proposed by Maxim. Also, Gollu's patch for the
bit bucket sgl descriptor has been added (patch #43), but it is of a
pretty manageable size.



Changes since v6


* 01/48 ("nvme: rename trace events to nvme_dev")
  - indentation nitpicks (Maxim)

* 04/48 ("nvme: bump spec data structures to v1.3")
  - added defines for temperature threshold and sensor selection (Maxim)
  - added defines for the EUI64 and NGUID NIDT types (Maxim)

* 09/48 ("nvme: add max_ioqpairs device parameter")
  - hoisted a bugfix out (Maxim)

* 11/48 ("nvme: refactor device realization")
  - pulled the removal of the duplicated cmbsz/cmbloc struct members to
a separate patch (Maxim)
  - fixed an incomplete error message (Maxim)

* 12/48 ("nvme: add temperature threshold feature")
  - use the defines added in 04/48 (Maxim)
  - remove the addition of the unused 'temperature' NvmeCtrl member

* 13/48 ("nvme: add support for the get log page command")
  - use the already defined NvmeErrorLog type (Maxim)
  - add the 'temperature' NvmeCtrl member here instead (Maxim)

* 15/48 ("nvme: add missing mandatory features")
  - fix off-by-one when validating dw11 for the interrupt vector config
feature (Maxim)
  - some nitpicks (Maxim)

* 16/48 ("nvme: additional tracing")
  - moved some traces (that was added in later patches) to here

* 19/48 ("nvme: support identify namespace descriptor list")
  - refactored for the nth time; I think I'm finally kind of happy with
it

* 20/48 ("nvme: enforce valid queue creation sequence")
  - added a comment (Maxim)

* 24/48 ("nvme: add mapping helpers")
  - fixed an off-by-one error that would cause valid read/writes to the
CMB to be rejected (Maxim)
  - add a check on len so we don't add 0 sized elements to the iovec
(Maxim)

* 26/48 ("nvme: remove redundant has_sg member")
  - pulled the s/dma_acct/block_acct change to a separate patch (Maxim)

* 36/48 ("nvme: allow multiple aios per command")
  - renamed nvme_req_register_aio to nvme_req_add_aio and added some
comments (Maxim)
  - killed the prinfo patch
  - pulled a bunch of refactoring stuff into other patches to make this
one easier to read (Maxim)

* 42/48 ("nvme: add support for scatter gather lists")
  - pulled the nvme_addr_read hardening into its own patch (Maxim)
  - refactored the logic around allowed sgl descriptor types (Maxim)
  - additional comments (Maxim)

* 45/48 ("nvme: support multiple namespaces")
  - fix an inconsistent return value
  - fix an edge error case where all namespaces are allocated (Maxim)



git-backport-diff against rebased v6


Key:
[] : patches are identical
[] : number of functional differences between upstream/downstream patch
[down] : patch is downstream-only
The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

001/48:[0006] [FC] 'nvme: rename trace events to nvme_dev'
002/48:[] [--] 'nvme: remove superfluous breaks'
003/48:[] [--] 'nvme: move device parameters to separate struct'
004/48:[0017] [FC] 'nvme: bump spec data structures to v1.3'
005/48:[down]  'nvme: use constants in identify'
006/48:[] [--] 'nvme: refactor nvme_addr_read'
007/48:[] [--] 'nvme: add support for the abort command'
008/48:[down]  'nvme: fix pci doorbell size calculation'
009/48:[0002] [FC] 'nvme: add max_ioqpairs device parameter'
010/48:[down]  'nvme: remove redundant cmbloc/cmbsz members'
011/48:[0022] [FC] 'nvme: refactor device realization'
012/48:[0013] [FC] 'nvme: add temperature threshold feature'
013/48:[0007] [FC] 'nvme: add support for the get log page command'
014/48:[] [-C] 'nvme: add support for the asynchronous event request 
command'
015/48:[0002] [FC] 'nvme: add missing mandatory features'
016/48:[0014] [FC] 'nvme: additional tracing'
017/48:[] [--] 'nvme: make sure ncqr and nsqr is valid'
018/48:[] [--] 'nvme: add log specific field to trace events'
019/48:[0029] [FC] 'nvme: support identify namespace descriptor list'
020/48:[0005] [FC] 'nvme: enforce valid queue creation sequence'
021/48:[] [--] 'nvme: provide the mandatory subnqn field'
022/48:[] [--] 'nvme: bump supported version to v1.3'
023/48:[] [--] 'nvme: memset preallocated requests structures'
024/48:[0010] [FC] 'nvme: add mapping helpers'
025/48:[down]  'nvme: replace dma_acct with blk_acct equivalent'
026/48:[0007] [FC] 'nvme: remove redundant has_sg member'
027/48:[0011] [FC] 'nvme: refactor dma read/write'
028/48:[0004] [FC] 'nvme: pass request along for tracing'
029/48:[] [--] 'nvme: add request mapping helper'
030/48:[] [--] 'nvme: verify validity of prp lists in the cmb'
031/48:[] [-C] 'nvme: refactor request bounds checking'
032/48:[] [--] 'nvme: add check for mdts'
033/48:[down]  'nvme: be consistent about zeros vs zeroes'
034/48:[down]  'nvme: refactor NvmeRequest'
035/48:[down]  'nvme: remove NvmeCmd parameter

[PATCH v7 02/48] nvme: remove superfluous breaks

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

These break statements were left over when commit 3036a626e9ef ("nvme:
add Get/Set Feature Timestamp support") was merged.

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 01e18fb9eb1f..da0e8af42823 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -788,7 +788,6 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 break;
 case NVME_TIMESTAMP:
 return nvme_get_feature_timestamp(n, cmd);
-break;
 default:
 trace_nvme_dev_err_invalid_getfeat(dw10);
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -832,11 +831,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 req->cqe.result =
 cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
 break;
-
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, cmd);
-break;
-
 default:
 trace_nvme_dev_err_invalid_setfeat(dw10);
 return NVME_INVALID_FIELD | NVME_DNR;
-- 
2.26.0




[PATCH v7 03/48] nvme: move device parameters to separate struct

2020-04-14 Thread Klaus Jensen
From: Klaus Jensen 

Move device configuration parameters to separate struct to make it
explicit what is configurable and what is set internally.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 44 ++--
 hw/block/nvme.h | 16 +---
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index da0e8af42823..249f759f076e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -64,12 +64,12 @@ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
*buf, int size)
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
 {
-return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
+return sqid < n->params.num_queues && n->sq[sqid] != NULL ? 0 : -1;
 }
 
 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
 {
-return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
+return cqid < n->params.num_queues && n->cq[cqid] != NULL ? 0 : -1;
 }
 
 static void nvme_inc_cq_tail(NvmeCQueue *cq)
@@ -631,7 +631,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
 trace_nvme_dev_err_invalid_create_cq_addr(prp1);
 return NVME_INVALID_FIELD | NVME_DNR;
 }
-if (unlikely(vector > n->num_queues)) {
+if (unlikely(vector > n->params.num_queues)) {
 trace_nvme_dev_err_invalid_create_cq_vector(vector);
 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
 }
@@ -783,7 +783,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, 
NvmeRequest *req)
 trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
 break;
 case NVME_NUMBER_OF_QUEUES:
-result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 
16));
+result = cpu_to_le32((n->params.num_queues - 2) |
+ ((n->params.num_queues - 2) << 16));
 trace_nvme_dev_getfeat_numq(result);
 break;
 case NVME_TIMESTAMP:
@@ -827,9 +828,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
*cmd, NvmeRequest *req)
 case NVME_NUMBER_OF_QUEUES:
 trace_nvme_dev_setfeat_numq((dw11 & 0xffff) + 1,
 ((dw11 >> 16) & 0xffff) + 1,
-n->num_queues - 1, n->num_queues - 1);
-req->cqe.result =
-cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
+n->params.num_queues - 1,
+n->params.num_queues - 1);
+req->cqe.result = cpu_to_le32((n->params.num_queues - 2) |
+  ((n->params.num_queues - 2) << 16));
 break;
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, cmd);
@@ -900,12 +902,12 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
 
 blk_drain(n->conf.blk);
 
-for (i = 0; i < n->num_queues; i++) {
+for (i = 0; i < n->params.num_queues; i++) {
 if (n->sq[i] != NULL) {
 nvme_free_sq(n->sq[i], n);
 }
 }
-for (i = 0; i < n->num_queues; i++) {
+for (i = 0; i < n->params.num_queues; i++) {
 if (n->cq[i] != NULL) {
 nvme_free_cq(n->cq[i], n);
 }
@@ -1306,7 +1308,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 int64_t bs_size;
 uint8_t *pci_conf;
 
-if (!n->num_queues) {
+if (!n->params.num_queues) {
 error_setg(errp, "num_queues can't be zero");
 return;
 }
@@ -1322,7 +1324,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 return;
 }
 
-if (!n->serial) {
+if (!n->params.serial) {
 error_setg(errp, "serial property not set");
 return;
 }
@@ -1339,25 +1341,25 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 pcie_endpoint_cap_init(pci_dev, 0x80);
 
 n->num_namespaces = 1;
-n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
+n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
 n->ns_size = bs_size / (uint64_t)n->num_namespaces;
 
 n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
-n->sq = g_new0(NvmeSQueue *, n->num_queues);
-n->cq = g_new0(NvmeCQueue *, n->num_queues);
+n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
+n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
 
 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
   "nvme", n->reg_size);
 pci_register_bar(pci_dev, 0,
 PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
 &n->iomem);
-msix_init_exclusive_bar(pci_dev, n->num_queues, 4, NULL);
+msix_init_exclusive_bar(pci_dev, n->params.num_queues, 4, NULL);
 
 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
 strpadcpy((char *)id->mn, sizeof(id->m

[PATCH v2] hvf: use standard CR0 and CR4 register definitions

2020-04-14 Thread Cameron Esfahani via
Signed-off-by: Cameron Esfahani 
---

v2:
+ Fix duplicate line Roman Bolshakov  found in review.
---
 target/i386/cpu.h  |  2 ++
 target/i386/hvf/hvf.c  |  2 +-
 target/i386/hvf/vmx.h  | 15 ---
 target/i386/hvf/x86.c  |  6 +++---
 target/i386/hvf/x86.h  | 34 --
 target/i386/hvf/x86_mmu.c  |  2 +-
 target/i386/hvf/x86_task.c |  3 ++-
 7 files changed, 17 insertions(+), 47 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 60d797d594..1286ec6e7a 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -225,6 +225,8 @@ typedef enum X86Seg {
 #define CR0_NE_MASK  (1U << 5)
 #define CR0_WP_MASK  (1U << 16)
 #define CR0_AM_MASK  (1U << 18)
+#define CR0_NW_MASK  (1U << 29)
+#define CR0_CD_MASK  (1U << 30)
 #define CR0_PG_MASK  (1U << 31)
 
 #define CR4_VME_MASK  (1U << 0)
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index d72543dc31..48f3ef050c 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -455,7 +455,7 @@ void hvf_reset_vcpu(CPUState *cpu) {
 wvmcs(cpu->hvf_fd, VMCS_GUEST_PDPTE0 + i * 2, pdpte[i]);
 }
 
-macvm_set_cr0(cpu->hvf_fd, 0x60000010);
+macvm_set_cr0(cpu->hvf_fd, CR0_CD_MASK | CR0_NW_MASK | CR0_ET_MASK);
 
 wvmcs(cpu->hvf_fd, VMCS_CR4_MASK, CR4_VMXE_MASK);
 wvmcs(cpu->hvf_fd, VMCS_CR4_SHADOW, 0x0);
diff --git a/target/i386/hvf/vmx.h b/target/i386/hvf/vmx.h
index 03d2c79b9c..8ec2e6414e 100644
--- a/target/i386/hvf/vmx.h
+++ b/target/i386/hvf/vmx.h
@@ -121,9 +121,10 @@ static inline void macvm_set_cr0(hv_vcpuid_t vcpu, 
uint64_t cr0)
 uint64_t pdpte[4] = {0, 0, 0, 0};
 uint64_t efer = rvmcs(vcpu, VMCS_GUEST_IA32_EFER);
 uint64_t old_cr0 = rvmcs(vcpu, VMCS_GUEST_CR0);
-uint64_t mask = CR0_PG | CR0_CD | CR0_NW | CR0_NE | CR0_ET;
+uint64_t mask = CR0_PG_MASK | CR0_CD_MASK | CR0_NW_MASK |
+CR0_NE_MASK | CR0_ET_MASK;
 
-if ((cr0 & CR0_PG) && (rvmcs(vcpu, VMCS_GUEST_CR4) & CR4_PAE) &&
+if ((cr0 & CR0_PG_MASK) && (rvmcs(vcpu, VMCS_GUEST_CR4) & CR4_PAE_MASK) &&
 !(efer & MSR_EFER_LME)) {
 address_space_read(&address_space_memory,
rvmcs(vcpu, VMCS_GUEST_CR3) & ~0x1f,
@@ -138,17 +139,17 @@ static inline void macvm_set_cr0(hv_vcpuid_t vcpu, 
uint64_t cr0)
 wvmcs(vcpu, VMCS_CR0_SHADOW, cr0);
 
 if (efer & MSR_EFER_LME) {
-if (!(old_cr0 & CR0_PG) && (cr0 & CR0_PG)) {
+if (!(old_cr0 & CR0_PG_MASK) && (cr0 & CR0_PG_MASK)) {
 enter_long_mode(vcpu, cr0, efer);
 }
-if (/*(old_cr0 & CR0_PG) &&*/ !(cr0 & CR0_PG)) {
+if (!(cr0 & CR0_PG_MASK)) {
 exit_long_mode(vcpu, cr0, efer);
 }
 }
 
 /* Filter new CR0 after we are finished examining it above. */
-cr0 = (cr0 & ~(mask & ~CR0_PG));
-wvmcs(vcpu, VMCS_GUEST_CR0, cr0 | CR0_NE | CR0_ET);
+cr0 = (cr0 & ~(mask & ~CR0_PG_MASK));
+wvmcs(vcpu, VMCS_GUEST_CR0, cr0 | CR0_NE_MASK | CR0_ET_MASK);
 
 hv_vcpu_invalidate_tlb(vcpu);
 hv_vcpu_flush(vcpu);
@@ -156,7 +157,7 @@ static inline void macvm_set_cr0(hv_vcpuid_t vcpu, uint64_t 
cr0)
 
 static inline void macvm_set_cr4(hv_vcpuid_t vcpu, uint64_t cr4)
 {
-uint64_t guest_cr4 = cr4 | CR4_VMXE;
+uint64_t guest_cr4 = cr4 | CR4_VMXE_MASK;
 
 wvmcs(vcpu, VMCS_GUEST_CR4, guest_cr4);
 wvmcs(vcpu, VMCS_CR4_SHADOW, cr4);
diff --git a/target/i386/hvf/x86.c b/target/i386/hvf/x86.c
index 3afcedc7fc..668c02de6e 100644
--- a/target/i386/hvf/x86.c
+++ b/target/i386/hvf/x86.c
@@ -119,7 +119,7 @@ bool x86_read_call_gate(struct CPUState *cpu, struct 
x86_call_gate *idt_desc,
 bool x86_is_protected(struct CPUState *cpu)
 {
 uint64_t cr0 = rvmcs(cpu->hvf_fd, VMCS_GUEST_CR0);
-return cr0 & CR0_PE;
+return cr0 & CR0_PE_MASK;
 }
 
 bool x86_is_real(struct CPUState *cpu)
@@ -150,13 +150,13 @@ bool x86_is_long64_mode(struct CPUState *cpu)
 bool x86_is_paging_mode(struct CPUState *cpu)
 {
 uint64_t cr0 = rvmcs(cpu->hvf_fd, VMCS_GUEST_CR0);
-return cr0 & CR0_PG;
+return cr0 & CR0_PG_MASK;
 }
 
 bool x86_is_pae_enabled(struct CPUState *cpu)
 {
 uint64_t cr4 = rvmcs(cpu->hvf_fd, VMCS_GUEST_CR4);
-return cr4 & CR4_PAE;
+return cr4 & CR4_PAE_MASK;
 }
 
 target_ulong linear_addr(struct CPUState *cpu, target_ulong addr, X86Seg seg)
diff --git a/target/i386/hvf/x86.h b/target/i386/hvf/x86.h
index c95d5b2116..bc0170b2a8 100644
--- a/target/i386/hvf/x86.h
+++ b/target/i386/hvf/x86.h
@@ -100,40 +100,6 @@ typedef struct x86_reg_flags {
 };
 } __attribute__ ((__packed__)) x86_reg_flags;
 
-typedef enum x86_reg_cr0 {
-CR0_PE =(1L << 0),
-CR0_MP =(1L << 1),
-CR0_EM =(1L << 2),
-CR0_TS =(1L << 3),
-CR0_ET =(1L << 4),
-CR0_NE =(1L << 5),
-CR0_WP =(1L << 16),
-CR0_AM =(1L << 18),
-CR0_NW =(1L << 29),
-CR0_CD =(1L << 30),
- 

[PATCH v2] nrf51: Fix last GPIO CNF address

2020-04-14 Thread Cameron Esfahani via
NRF51_GPIO_REG_CNF_END doesn't actually refer to the start of the last
valid CNF register: it's referring to the last byte of the last valid
CNF register.

This hasn't been a problem up to now, as the current implementation in
memory.c turns an unaligned 4-byte read from 0x77f into a single-byte read
and the qtest only looks at the least-significant byte of the register.

But when running with patches which fix unaligned accesses in memory.c,
the qtest breaks.

Considering NRF51 doesn't support unaligned accesses, the simplest fix
is to actually set NRF51_GPIO_REG_CNF_END to the start of the last valid
CNF register: 0x77c.

Now, qtests work with or without the unaligned access patches.
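
For reference, the arithmetic behind the new value (the nRF51 has 32 GPIO
pins, each with a 4-byte PIN_CNF register):

    NRF51_GPIO_REG_CNF_START + 31 * 4 = 0x700 + 0x7c = 0x77C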

Reviewed-by: Cédric Le Goater 
Tested-by: Cédric Le Goater 
Reviewed-by: Joel Stanley 
Signed-off-by: Cameron Esfahani 
---
 include/hw/gpio/nrf51_gpio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/hw/gpio/nrf51_gpio.h b/include/hw/gpio/nrf51_gpio.h
index 337ee534bb..1d62bbc928 100644
--- a/include/hw/gpio/nrf51_gpio.h
+++ b/include/hw/gpio/nrf51_gpio.h
@@ -42,7 +42,7 @@
 #define NRF51_GPIO_REG_DIRSET   0x518
 #define NRF51_GPIO_REG_DIRCLR   0x51C
 #define NRF51_GPIO_REG_CNF_START0x700
-#define NRF51_GPIO_REG_CNF_END  0x77F
+#define NRF51_GPIO_REG_CNF_END  0x77C
 
 #define NRF51_GPIO_PULLDOWN 1
 #define NRF51_GPIO_PULLUP 3
-- 
2.24.0




[PATCH v2 1/3] exec: fetch the alignment of Linux devdax pmem character device nodes

2020-04-14 Thread Jingqi Liu
If the backend file is a devdax pmem character device, the alignment
specified by the 'align=NUM' option of '-object memory-backend-file'
needs to match the alignment requirement of that device.

This patch uses the libdaxctl interfaces to fetch the devdax pmem
device's 'align', so that we can compare it with the NUM of 'align=NUM'.
NUM needs to be larger than or equal to the device's 'align'.

It also fixes the problem that mmap() fails in qemu_ram_mmap()
when the NUM of 'align=NUM' is less than the devdax pmem device's 'align'.

Suggested-by: Dan Williams 
Signed-off-by: Jingqi Liu 
---
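For illustration, the option being validated here is typically used like
this (paths and sizes are assumptions; the 2M value should match what
"ndctl list -X" reports for the device):

    qemu-system-x86_64 -machine pc,nvdimm \
        -m 4G,slots=2,maxmem=8G \
        -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=2G,align=2M \
        -device nvdimm,id=nvdimm1,memdev=mem1

With this patch, an 'align' smaller than the device's own alignment is
rejected with an error up front instead of failing later in qemu_ram_mmap().
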
 exec.c | 54 +-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/exec.c b/exec.c
index de9d949902..2c3444e47e 100644
--- a/exec.c
+++ b/exec.c
@@ -77,6 +77,10 @@
 
 #include "monitor/monitor.h"
 
+#ifdef CONFIG_LIBDAXCTL
+#include 
+#endif
+
 //#define DEBUG_SUBPAGE
 
 #if !defined(CONFIG_USER_ONLY)
@@ -1736,6 +1740,46 @@ static int64_t get_file_size(int fd)
 return size;
 }
 
+static int64_t get_file_align(int fd)
+{
+int64_t align = -1;
+#if defined(__linux__) && defined(CONFIG_LIBDAXCTL)
+struct stat st;
+
+if (fstat(fd, &st) < 0) {
+return -errno;
+}
+
+/* Special handling for devdax character devices */
+if (S_ISCHR(st.st_mode)) {
+g_autofree char *path = NULL;
+g_autofree char *rpath = NULL;
+struct daxctl_ctx *ctx;
+struct daxctl_region *region;
+int rc = 0;
+
+path = g_strdup_printf("/sys/dev/char/%d:%d",
+major(st.st_rdev), minor(st.st_rdev));
+rpath = realpath(path, NULL);
+
+rc = daxctl_new(&ctx);
+if (rc) {
+return -1;
+}
+
+daxctl_region_foreach(ctx, region) {
+if (strstr(rpath, daxctl_region_get_path(region))) {
+align = daxctl_region_get_align(region);
+break;
+}
+}
+daxctl_unref(ctx);
+}
+#endif /* defined(__linux__) && defined(CONFIG_LIBDAXCTL) */
+
+return align;
+}
+
 static int file_ram_open(const char *path,
  const char *region_name,
  bool *created,
@@ -2275,7 +2319,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, 
MemoryRegion *mr,
 {
 RAMBlock *new_block;
 Error *local_err = NULL;
-int64_t file_size;
+int64_t file_size, file_align;
 
 /* Just support these ram flags by now. */
 assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
@@ -2311,6 +2355,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, 
MemoryRegion *mr,
 return NULL;
 }
 
+file_align = get_file_align(fd);
+if (file_align > 0 && mr && file_align > mr->align) {
+error_setg(errp, "backing store align 0x%" PRIx64
+   " is larger than 'align' option 0x" RAM_ADDR_FMT,
+   file_align, mr->align);
+return NULL;
+}
+
 new_block = g_malloc0(sizeof(*new_block));
 new_block->mr = mr;
 new_block->used_length = size;
-- 
2.17.1




[PATCH v2 2/3] docs/nvdimm: add description of alignment requirement of device dax

2020-04-14 Thread Jingqi Liu
For device dax (e.g., /dev/dax0.0), the NUM of the 'align=NUM' option
needs to match the alignment requirement of the device dax:
it must be larger than or equal to the device's 'align'.

Signed-off-by: Jingqi Liu 
---
 docs/nvdimm.txt | 9 +
 1 file changed, 9 insertions(+)

diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
index 362e99109e..3c7b6dab5f 100644
--- a/docs/nvdimm.txt
+++ b/docs/nvdimm.txt
@@ -132,6 +132,15 @@ address to the page size (getpagesize(2)) by default. 
However, some
 types of backends may require an alignment different than the page
 size. In that case, QEMU v2.12.0 and later provide 'align' option to
 memory-backend-file to allow users to specify the proper alignment.
+For device dax (e.g., /dev/dax0.0), this alignment needs to match the
+alignment requirement of the device dax. The NUM of 'align=NUM' option
+must be larger than or equal to the 'align' of device dax.
+We can use the following command to show the 'align' of device dax.
+
+ndctl list -X
+
+In order to get the proper 'align' of device dax, you need to install
+the library 'libdaxctl'.
 
 For example, device dax require the 2 MB alignment, so we can use
 following QEMU command line options to use it (/dev/dax0.0) as the
-- 
2.17.1




[PATCH v2 0/3] fetch the alignment of device dax

2020-04-14 Thread Jingqi Liu
This series adds libdaxctl support and fetches the alignment of
device dax through the libdaxctl [1] APIs.

QEMU uses mmap(2) to map vNVDIMM backends and aligns the mapping
address to the page size (getpagesize(2)) by default. However, some
types of backends may require an alignment different than the page
size. The 'align' option is provided to memory-backend-file to allow
users to specify the proper alignment.

For device dax (e.g., /dev/dax0.0), the 'align' option needs to
match the alignment requirement of the device dax, which can be fetched
through the libdaxctl APIs.

[1] Libdaxctl is a part of ndctl project.
The project's repository is: https://github.com/pmem/ndctl

Changelog:
  v2: Per Paolo and Dan suggestions, fetch the alignment of device dax
  through libdaxctl APIs.

  v1: The initial version.
  Fetch the alignment through "/sys/dev/char/%d:%d/device/align".

Jingqi Liu (3):
  exec: fetch the alignment of Linux devdax pmem character device nodes
  docs/nvdimm: add description of alignment requirement of device dax
  configure: add libdaxctl support

 configure   | 30 +++
 docs/nvdimm.txt |  9 +
 exec.c  | 54 -
 3 files changed, 92 insertions(+), 1 deletion(-)

-- 
2.17.1




[PATCH v2 3/3] configure: add libdaxctl support

2020-04-14 Thread Jingqi Liu
Add a pair of configure options --{enable,disable}-libdaxctl to control
whether QEMU is compiled with libdaxctl [1]. Libdaxctl is a utility
library for managing the device dax subsystem.

QEMU uses mmap(2) to map vNVDIMM backends and aligns the mapping
address to the page size (getpagesize(2)) by default. However, some
types of backends may require an alignment different than the page
size. The 'align' option is provided to memory-backend-file to allow
users to specify the proper alignment.

For device dax (e.g., /dev/dax0.0), the 'align' option needs to match
the alignment requirement of the device dax, which can be fetched
through the libdaxctl APIs.

[1] Libdaxctl is a part of ndctl project.
The project's repository is: https://github.com/pmem/ndctl

For more information about libdaxctl APIs, you can refer to the
comments in source code of: pmem/ndctl/daxctl/lib/libdaxctl.c.

Signed-off-by: Jingqi Liu 
---
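A minimal sketch of how the new switch is exercised (the target list is an
assumption):

    ./configure --target-list=x86_64-softmmu --enable-libdaxctl
    make

When the pkg-config check succeeds, the configure summary prints
"libdaxctl support yes" and CONFIG_LIBDAXCTL is added to config-host.mak;
with --disable-libdaxctl the build simply proceeds without it.
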
 configure | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/configure b/configure
index e225a1e3ff..df1752cf08 100755
--- a/configure
+++ b/configure
@@ -509,6 +509,7 @@ libpmem=""
 default_devices="yes"
 plugins="no"
 fuzzing="no"
+libdaxctl=""
 
 supported_cpu="no"
 supported_os="no"
@@ -1601,6 +1602,10 @@ for opt do
   ;;
   --gdb=*) gdb_bin="$optarg"
   ;;
+  --enable-libdaxctl) libdaxctl=yes
+  ;;
+  --disable-libdaxctl) libdaxctl=no
+  ;;
   *)
   echo "ERROR: unknown option $opt"
   echo "Try '$0 --help' for more information"
@@ -1894,6 +1899,7 @@ disabled with --disable-FEATURE, default is enabled if 
available:
   debug-mutex mutex debugging support
   libpmem libpmem support
   xkbcommon   xkbcommon support
+  libdaxctl   libdaxctl support
 
 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -6190,6 +6196,25 @@ if test "$libpmem" != "no"; then
fi
 fi
 
+##
+# check for libdaxctl
+
+if test "$libdaxctl" != "no"; then
+   if $pkg_config --exists "libdaxctl"; then
+   libdaxctl="yes"
+   libdaxctl_libs=$($pkg_config --libs libdaxctl)
+   libdaxctl_cflags=$($pkg_config --cflags libdaxctl)
+   libs_softmmu="$libs_softmmu $libdaxctl_libs"
+   QEMU_CFLAGS="$QEMU_CFLAGS $libdaxctl_cflags"
+   else
+   if test "$libdaxctl" = "yes" ; then
+   feature_not_found "libdaxctl" "Install libdaxctl"
+   fi
+   libdaxctl="no"
+   fi
+fi
+
+
 ##
 # check for slirp
 
@@ -6767,6 +6792,7 @@ echo "parallels support $parallels"
 echo "sheepdog support  $sheepdog"
 echo "capstone  $capstone"
 echo "libpmem support   $libpmem"
+echo "libdaxctl support $libdaxctl"
 echo "libudev   $libudev"
 echo "default devices   $default_devices"
 echo "plugin support$plugins"
@@ -7590,6 +7616,10 @@ if test "$libpmem" = "yes" ; then
   echo "CONFIG_LIBPMEM=y" >> $config_host_mak
 fi
 
+if test "$libdaxctl" = "yes" ; then
+  echo "CONFIG_LIBDAXCTL=y" >> $config_host_mak
+fi
+
 if test "$bochs" = "yes" ; then
   echo "CONFIG_BOCHS=y" >> $config_host_mak
 fi
-- 
2.17.1




[PATCH 3/4] char-socket: avoid double call tcp_chr_free_connection

2020-04-14 Thread Li Feng
Calling tcp_chr_free_connection() twice generates a crash.

Signed-off-by: Li Feng 
---
 chardev/char-socket.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 185fe38dda..43aab8f24b 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -476,6 +476,11 @@ static void tcp_chr_disconnect_locked(Chardev *chr)
 SocketChardev *s = SOCKET_CHARDEV(chr);
 bool emit_close = s->state == TCP_CHARDEV_STATE_CONNECTED;
 
+/* avoid re-enter when socket read/write error and disconnect event. */
+if (s->state == TCP_CHARDEV_STATE_DISCONNECTED) {
+return;
+}
+
 tcp_chr_free_connection(chr);
 
 if (s->listener) {
-- 
2.11.0







[PATCH 1/4] vhost-user-blk: delay vhost_user_blk_disconnect

2020-04-14 Thread Li Feng
Since commit b0a335e351103bf92f3f9d0bd5759311be8156ac, a socket write
may trigger a disconnect event, calling vhost_user_blk_disconnect() and
clearing all the vhost_dev structures. The next socket read will then
encounter an invalid pointer to vhost_dev.

Signed-off-by: Li Feng 
---
 hw/block/vhost-user-blk.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 17df5338e7..776b9af3eb 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -349,11 +349,24 @@ static void vhost_user_blk_disconnect(DeviceState *dev)
 vhost_dev_cleanup(&s->dev);
 }
 
+static void vhost_user_blk_event(void *opaque, QEMUChrEvent event);
+
+static void vhost_user_blk_chr_closed_bh(void *opaque)
+{
+DeviceState *dev = opaque;
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VHostUserBlk *s = VHOST_USER_BLK(vdev);
+vhost_user_blk_disconnect(dev);
+qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, vhost_user_blk_event,
+ NULL, (void *)dev, NULL, true);
+}
+
 static void vhost_user_blk_event(void *opaque, QEMUChrEvent event)
 {
 DeviceState *dev = opaque;
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
 VHostUserBlk *s = VHOST_USER_BLK(vdev);
+AioContext *ctx;
 
 switch (event) {
 case CHR_EVENT_OPENED:
@@ -363,7 +376,16 @@ static void vhost_user_blk_event(void *opaque, 
QEMUChrEvent event)
 }
 break;
 case CHR_EVENT_CLOSED:
-vhost_user_blk_disconnect(dev);
+/*
+ * a close event may happen during a read/write, but vhost
+ * code assumes the vhost_dev remains setup, so delay the
+ * stop & clear to idle.
+ */
+ctx = qemu_get_current_aio_context();
+
+qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, NULL,
+ NULL, NULL, NULL, false);
+aio_bh_schedule_oneshot(ctx, vhost_user_blk_chr_closed_bh, opaque);
 break;
 case CHR_EVENT_BREAK:
 case CHR_EVENT_MUX_IN:
-- 
2.11.0







[PATCH 2/4] vhost-user-blk: fix invalid memory access

2020-04-14 Thread Li Feng
When s->inflight is freed, vhost_dev_free_inflight() may still try to access
s->inflight->addr, which retriggers the following issue.

==7309==ERROR: AddressSanitizer: heap-use-after-free on address 0x604001020d18 
at pc 0x55ce948a bp 0x7fffb170 sp 0x7fffb160
READ of size 8 at 0x604001020d18 thread T0
#0 0x55ce9489 in vhost_dev_free_inflight 
/root/smartx/qemu-el7/qemu-test/hw/virtio/vhost.c:1473
#1 0x55cd86eb in virtio_reset 
/root/smartx/qemu-el7/qemu-test/hw/virtio/virtio.c:1214
#2 0x560d3eff in virtio_pci_reset hw/virtio/virtio-pci.c:1859
#3 0x55f2ac53 in device_set_realized hw/core/qdev.c:893
#4 0x561d572c in property_set_bool qom/object.c:1925
#5 0x561de8de in object_property_set_qobject qom/qom-qobject.c:27
#6 0x561d99f4 in object_property_set_bool qom/object.c:1188
#7 0x55e50ae7 in qdev_device_add 
/root/smartx/qemu-el7/qemu-test/qdev-monitor.c:626
#8 0x55e51213 in qmp_device_add 
/root/smartx/qemu-el7/qemu-test/qdev-monitor.c:806
#9 0x55e8ff40 in hmp_device_add 
/root/smartx/qemu-el7/qemu-test/hmp.c:1951
#10 0x55be889a in handle_hmp_command 
/root/smartx/qemu-el7/qemu-test/monitor.c:3404
#11 0x55beac8b in monitor_command_cb 
/root/smartx/qemu-el7/qemu-test/monitor.c:4296
#12 0x56433eb7 in readline_handle_byte util/readline.c:393
#13 0x55be89ec in monitor_read 
/root/smartx/qemu-el7/qemu-test/monitor.c:4279
#14 0x563285cc in tcp_chr_read chardev/char-socket.c:470
#15 0x7670b968 in g_main_context_dispatch 
(/lib64/libglib-2.0.so.0+0x4a968)
#16 0x5640727c in glib_pollfds_poll util/main-loop.c:215
#17 0x5640727c in os_host_main_loop_wait util/main-loop.c:238
#18 0x5640727c in main_loop_wait util/main-loop.c:497
#19 0x55b2d0bf in main_loop /root/smartx/qemu-el7/qemu-test/vl.c:2013
#20 0x55b2d0bf in main /root/smartx/qemu-el7/qemu-test/vl.c:4776
#21 0x7fffdd2eb444 in __libc_start_main (/lib64/libc.so.6+0x22444)
#22 0x55b3767a  
(/root/smartx/qemu-el7/qemu-test/x86_64-softmmu/qemu-system-x86_64+0x5e367a)

0x604001020d18 is located 8 bytes inside of 40-byte region 
[0x604001020d10,0x604001020d38)
freed by thread T0 here:
#0 0x76f00508 in __interceptor_free (/lib64/libasan.so.4+0xde508)
#1 0x7671107d in g_free (/lib64/libglib-2.0.so.0+0x5007d)

previously allocated by thread T0 here:
#0 0x76f00a88 in __interceptor_calloc (/lib64/libasan.so.4+0xdea88)
#1 0x76710fc5 in g_malloc0 (/lib64/libglib-2.0.so.0+0x4ffc5)

SUMMARY: AddressSanitizer: heap-use-after-free 
/root/smartx/qemu-el7/qemu-test/hw/virtio/vhost.c:1473 in 
vhost_dev_free_inflight
Shadow bytes around the buggy address:
  0x0c08801fc150: fa fa 00 00 00 00 04 fa fa fa fd fd fd fd fd fa
  0x0c08801fc160: fa fa fd fd fd fd fd fd fa fa 00 00 00 00 04 fa
  0x0c08801fc170: fa fa 00 00 00 00 00 01 fa fa 00 00 00 00 04 fa
  0x0c08801fc180: fa fa 00 00 00 00 00 01 fa fa 00 00 00 00 00 01
  0x0c08801fc190: fa fa 00 00 00 00 00 fa fa fa 00 00 00 00 04 fa
=>0x0c08801fc1a0: fa fa fd[fd]fd fd fd fa fa fa fd fd fd fd fd fa
  0x0c08801fc1b0: fa fa fd fd fd fd fd fa fa fa fd fd fd fd fd fa
  0x0c08801fc1c0: fa fa 00 00 00 00 00 fa fa fa fd fd fd fd fd fd
  0x0c08801fc1d0: fa fa 00 00 00 00 00 01 fa fa fd fd fd fd fd fa
  0x0c08801fc1e0: fa fa fd fd fd fd fd fa fa fa fd fd fd fd fd fd
  0x0c08801fc1f0: fa fa 00 00 00 00 00 01 fa fa fd fd fd fd fd fa
Shadow byte legend (one shadow byte represents 8 application bytes):
  Addressable:   00
  Partially addressable: 01 02 03 04 05 06 07
  Heap left redzone:   fa
  Freed heap region:   fd
  Stack left redzone:  f1
  Stack mid redzone:   f2
  Stack right redzone: f3
  Stack after return:  f5
  Stack use after scope:   f8
  Global redzone:  f9
  Global init order:   f6
  Poisoned by user:f7
  Container overflow:  fc
  Array cookie:ac
  Intra object redzone:bb
  ASan internal:   fe
  Left alloca redzone: ca
  Right alloca redzone:cb
==7309==ABORTING

Signed-off-by: Li Feng 
---
 hw/block/vhost-user-blk.c | 4 
 hw/virtio/vhost.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 776b9af3eb..19e79b96e4 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -463,7 +463,9 @@ reconnect:
 
 virtio_err:
 g_free(s->vhost_vqs);
+s->vhost_vqs = NULL;
 g_free(s->inflight);
+s->inflight = NULL;
 for (i = 0; i < s->num_queues; i++) {
 virtio_delete_queue(s->virtqs[i]);
 }
@@ -484,7 +486,9 @@ static void vhost_user_blk_device_unrealize(DeviceState 
*dev, Error **errp)
 vhost_dev_cleanup(&s->dev);
 vhost_dev_free_inflight(s->inflight);
 g_free(s->vhost_vqs);
+s->vhost_vqs = NULL;
 g_free(s->inflight);
+s->inflight = NULL;
 
 for (i = 0; i < s->num_queues; i++) {
 vi

[PATCH 4/4] vhost-user-blk: fix crash in realize process

2020-04-14 Thread Li Feng
The crash could be reproduced like this:
1. break vhost_user_write;
2. kill the vhost-user-blk target;
3. let qemu continue running;
4. start vhost-user-blk;
5. see crash!

This fix makes two changes:
1. set s->connected to true only after vhost_dev_init() succeeds;
2. call vhost_dev_get_config() only when s->connected is true, otherwise
hdev->host_ops will be a null pointer.

Signed-off-by: Li Feng 
---
 hw/block/vhost-user-blk.c | 47 +--
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 19e79b96e4..35386b7cb7 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -303,8 +303,6 @@ static int vhost_user_blk_connect(DeviceState *dev)
 if (s->connected) {
 return 0;
 }
-s->connected = true;
-
 s->dev.nvqs = s->num_queues;
 s->dev.vqs = s->vhost_vqs;
 s->dev.vq_index = 0;
@@ -318,6 +316,11 @@ static int vhost_user_blk_connect(DeviceState *dev)
  strerror(-ret));
 return ret;
 }
+/*
+ * only set to true after vhost_dev_init returns ok, because a CLOSE event
+ * may happen in the vhost_dev_init routine.
+ */
+s->connected = true;
 
 /* restore vhost state */
 if (virtio_device_started(vdev, vdev->status)) {
@@ -401,6 +404,7 @@ static void vhost_user_blk_device_realize(DeviceState *dev, 
Error **errp)
 VHostUserBlk *s = VHOST_USER_BLK(vdev);
 Error *err = NULL;
 int i, ret;
+bool reconnect;
 
 if (!s->chardev.chr) {
 error_setg(errp, "vhost-user-blk: chardev is mandatory");
@@ -433,27 +437,26 @@ static void vhost_user_blk_device_realize(DeviceState 
*dev, Error **errp)
 s->inflight = g_new0(struct vhost_inflight, 1);
 s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
 s->connected = false;
+reconnect = false;
 
-qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, vhost_user_blk_event,
- NULL, (void *)dev, NULL, true);
-
-reconnect:
-if (qemu_chr_fe_wait_connected(&s->chardev, &err) < 0) {
-error_report_err(err);
-goto virtio_err;
-}
-
-/* check whether vhost_user_blk_connect() failed or not */
-if (!s->connected) {
-goto reconnect;
-}
-
-ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
-   sizeof(struct virtio_blk_config));
-if (ret < 0) {
-error_report("vhost-user-blk: get block config failed");
-goto reconnect;
-}
+do {
+if (qemu_chr_fe_wait_connected(&s->chardev, &err) < 0) {
+error_report_err(err);
+goto virtio_err;
+}
+qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, 
vhost_user_blk_event,
+ NULL, (void *)dev, NULL, true);
+if (s->connected) {
+ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
+   sizeof(struct virtio_blk_config));
+if (ret < 0) {
+error_report("vhost-user-blk: get block config failed");
+reconnect = true;
+} else {
+reconnect = false;
+}
+}
+} while (!s->connected || reconnect);
 
 if (s->blkcfg.num_queues != s->num_queues) {
 s->blkcfg.num_queues = s->num_queues;
-- 
2.11.0







[PATCH 0/4] fix crashes when inject errors to vhost-user-blk chardev

2020-04-14 Thread Li Feng
The following patches fix various crashes that happen when injecting errors into
the chardev unix domain socket.

The crashes are encountered when the socket goes from connected to disconnected
during the vhost-user-blk realize routine.

These crashes could be reproduced like this (see the gdb sketch after the list):
1. gdb break at vhost_user_write;
2. add a vhost-user-blk device through qmp;
3. when stopped at vhost_user_write, kill the vhost-user-blk target;
4. let qemu continue running;
5. start vhost-user-blk;
6. see crash!
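
For steps 1, 3 and 4, the gdb side looks roughly like this (the binary path
and options are assumptions):

    $ gdb --args ./x86_64-softmmu/qemu-system-x86_64 <usual options>
    (gdb) break vhost_user_write
    (gdb) run
    ... add the vhost-user-blk device via qmp; when the breakpoint hits,
    ... kill the vhost-user-blk target, then:
    (gdb) continue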

The 'CLOSE' event path is the core trouble maker.

qemu_chr_fe_set_handlers
  -> vhost_user_blk_event(OPEN)
     -> vhost_user_blk_connect
        -> vhost_dev_init
           -> vhost_user_blk_event(CLOSE)
              -> vhost_dev_cleanup


Li Feng (4):
  vhost-user-blk: delay vhost_user_blk_disconnect
  vhost-user-blk: fix invalid memory access
  char-socket: avoid double call tcp_chr_free_connection
  vhost-user-blk: fix crash in realize process

 chardev/char-socket.c |  5 
 hw/block/vhost-user-blk.c | 75 ---
 hw/virtio/vhost.c |  2 +-
 3 files changed, 58 insertions(+), 24 deletions(-)

-- 
2.11.0







[Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-04-14 Thread Rafael David Tinoco
** Changed in: qemu (Ubuntu Eoan)
 Assignee: Rafael David Tinoco (rafaeldtinoco) => (unassigned)

** Changed in: qemu
 Assignee: Rafael David Tinoco (rafaeldtinoco) => (unassigned)

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Incomplete
Status in QEMU:
  In Progress
Status in qemu package in Ubuntu:
  Incomplete
Status in qemu source package in Bionic:
  Incomplete
Status in qemu source package in Disco:
  Incomplete
Status in qemu source package in Eoan:
  Incomplete
Status in qemu source package in Focal:
  Incomplete

Bug description:
  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely in approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 72861)]
  [New Thread 0xa6ffcd90 (LWP 72862)]
  [New Thread 0xa67fbd90 (LWP 72863)]
  [New Thread 0xa5ffad90 (LWP 72864)]

  [Thread 0xa5ffad90 (LWP 72864) exited]
  [Thread 0xa6ffcd90 (LWP 72862) exited]
  [Thread 0xa77fdd90 (LWP 72861) exited]
  [Thread 0xbdb57d90 (LWP 72841) exited]
  [Thread 0xa67fbd90 (LWP 72863) exited]
  [Thread 0xacac9d90 (LWP 72859) exited]
  [Thread 0xa7ffed90 (LWP 72860) exited]

  
  """

  All the tasks left are blocked in a system call, so no task left to call
  qemu_futex_wake() to unblock thread #2 (in futex()), which would unblock
  thread #1 (doing poll() in a pipe with thread #2).

  Those 7 threads exit before disk conversion is complete (sometimes in
  the beginning, sometimes at the end).

  

  [ Original Description ]

  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760,
  timeout=, timeout@entry=0x0, sigmask=0xc123b950)
  at ../sysdeps/unix/sysv/linux/ppoll.c:39
  #1  0xbbefaf00 in ppoll (__ss=0x0, __timeout=0x0, __nfds=,
  __fds=) at /usr/include/aarch64-linux-gnu/bits/poll2.h:77
  #2  qemu_poll_ns (fds=, nfds=,
  timeout=timeout@entry=-1) at util/qemu-timer.c:322
  #3  0xbbefbf80 in os_host_main_loop_wait (timeout=-1)
  at util/main-loop.c:233
  #4  main_loop_wait (nonblocking=) at util/main-loop.c:497
  #5  0xbbe2aa30 in convert_do_copy (s=0xc123bb58) at 
qemu-img.c:1980
  #6  img_convert (argc=, argv=) at 
qemu-img.c:2456
  #7  0xbbe2333c in main (argc=7, argv=) at 
qemu-img.c:4975

  Reproduced w/ latest QEMU git (@ 53744e0a182)

To manage notifications about this bug go to:
https://bugs.launchpad.net/kunpeng920/+bug/1805256/+subscriptions



Re: [RFC PATCH v1 00/26] VM introspection

2020-04-14 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200415005938.23895-1-ala...@bitdefender.com/



Hi,

This series failed the docker-mingw@fedora build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#! /bin/bash
export ARCH=x86_64
make docker-image-fedora V=1 NETWORK=1
time make docker-test-mingw@fedora J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  block/blklogwrites.o
  CC  block/block-backend.o

Warning, treated as error:
/tmp/qemu-test/src/docs/../qemu-options.hx:5041:Inline literal start-string 
without end-string.

Warning, treated as error:
/tmp/qemu-test/src/docs/../qemu-options.hx:5041:Inline literal start-string 
without end-string.
  CC  block/snapshot.o
  CC  block/qapi.o
---
  CC  block/file-win32.o
  CC  block/null.o
  CC  block/mirror.o
make: *** [Makefile:1115: 
.docs_system_qemu.1_docs_system_qemu-block-drivers.7_docs_system_qemu-cpu-models.7.sentinel.]
 Error 2
make: *** Deleting file 
'.docs_system_qemu.1_docs_system_qemu-block-drivers.7_docs_system_qemu-cpu-models.7.sentinel.'
make: *** Waiting for unfinished jobs
make: *** [Makefile:1104: docs/system/index.html] Error 2
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in 
sys.exit(main())
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=c583cfe64588a174af36ef6d29c3', '-u', 
'1001', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-6xohf4_4/src/docker-src.2020-04-14-22.23.28.25626:/var/tmp/qemu:z,ro',
 'qemu:fedora', '/var/tmp/qemu/run', 'test-mingw']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=c583cfe64588a174af36ef6d29c3
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-6xohf4_4/src'
make: *** [docker-run-test-mingw@fedora] Error 2

real2m43.146s
user0m6.958s


The full log is available at
http://patchew.org/logs/20200415005938.23895-1-ala...@bitdefender.com/testing.docker-mingw@fedora/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH-for-5.1] gdbstub: Rename GByteArray variable 'mem_buf' as 'array'

2020-04-14 Thread David Gibson
On Tue, Apr 14, 2020 at 01:28:48PM +0200, Philippe Mathieu-Daudé wrote:
> GByteArray type has should not be treated as a u8[] buffer.
> The GLib Byte Arrays API should be used instead.
> Rename the 'mem_buf' variable as 'array' to make it more
> obvious in the code.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> Based-on: <20200414111846.27495-1-phi...@redhat.com>
> Signed-off-by: Philippe Mathieu-Daudé 

ppc parts

Acked-by: David Gibson 

> ---
>  include/exec/gdbstub.h  | 34 +++---
>  include/hw/core/cpu.h   |  2 +-
>  target/alpha/cpu.h  |  2 +-
>  target/arm/cpu.h|  4 +-
>  target/cris/cpu.h   |  4 +-
>  target/hppa/cpu.h   |  2 +-
>  target/i386/cpu.h   |  2 +-
>  target/lm32/cpu.h   |  2 +-
>  target/m68k/cpu.h   |  2 +-
>  target/microblaze/cpu.h |  2 +-
>  target/mips/internal.h  |  2 +-
>  target/openrisc/cpu.h   |  2 +-
>  target/ppc/cpu.h|  4 +-
>  target/riscv/cpu.h  |  2 +-
>  target/rx/cpu.h |  2 +-
>  target/s390x/internal.h |  2 +-
>  target/sh4/cpu.h|  2 +-
>  target/sparc/cpu.h  |  2 +-
>  target/xtensa/cpu.h |  2 +-
>  gdbstub.c   |  6 +--
>  hw/core/cpu.c   |  3 +-
>  target/alpha/gdbstub.c  |  4 +-
>  target/arm/gdbstub.c| 10 ++--
>  target/arm/gdbstub64.c  | 10 ++--
>  target/cris/gdbstub.c   | 34 +++---
>  target/hppa/gdbstub.c   |  6 +--
>  target/i386/gdbstub.c   | 92 ++---
>  target/lm32/gdbstub.c   | 18 
>  target/m68k/gdbstub.c   | 10 ++--
>  target/m68k/helper.c| 24 +-
>  target/microblaze/gdbstub.c |  6 +--
>  target/mips/gdbstub.c   | 30 ++--
>  target/nios2/cpu.c  |  8 ++--
>  target/openrisc/gdbstub.c   | 10 ++--
>  target/riscv/gdbstub.c  |  6 +--
>  target/rx/gdbstub.c | 22 -
>  target/s390x/gdbstub.c  | 28 +--
>  target/sh4/gdbstub.c| 38 +++
>  target/sparc/gdbstub.c  | 46 +--
>  target/xtensa/gdbstub.c | 20 
>  40 files changed, 254 insertions(+), 253 deletions(-)
> 
> diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h
> index 52a4a936c6..29150d1344 100644
> --- a/include/exec/gdbstub.h
> +++ b/include/exec/gdbstub.h
> @@ -80,47 +80,47 @@ void gdb_register_coprocessor(CPUState *cpu,
>   * append to the array.
>   */
>  
> -static inline int gdb_get_reg8(GByteArray *buf, uint8_t val)
> +static inline int gdb_get_reg8(GByteArray *array, uint8_t val)
>  {
> -g_byte_array_append(buf, &val, 1);
> +g_byte_array_append(array, &val, 1);
>  return 1;
>  }
>  
> -static inline int gdb_get_reg16(GByteArray *buf, uint16_t val)
> +static inline int gdb_get_reg16(GByteArray *array, uint16_t val)
>  {
>  uint16_t to_word = tswap16(val);
> -g_byte_array_append(buf, (uint8_t *) &to_word, 2);
> +g_byte_array_append(array, (uint8_t *) &to_word, 2);
>  return 2;
>  }
>  
> -static inline int gdb_get_reg32(GByteArray *buf, uint32_t val)
> +static inline int gdb_get_reg32(GByteArray *array, uint32_t val)
>  {
>  uint32_t to_long = tswap32(val);
> -g_byte_array_append(buf, (uint8_t *) &to_long, 4);
> +g_byte_array_append(array, (uint8_t *) &to_long, 4);
>  return 4;
>  }
>  
> -static inline int gdb_get_reg64(GByteArray *buf, uint64_t val)
> +static inline int gdb_get_reg64(GByteArray *array, uint64_t val)
>  {
>  uint64_t to_quad = tswap64(val);
> -g_byte_array_append(buf, (uint8_t *) &to_quad, 8);
> +g_byte_array_append(array, (uint8_t *) &to_quad, 8);
>  return 8;
>  }
>  
> -static inline int gdb_get_reg128(GByteArray *buf, uint64_t val_hi,
> +static inline int gdb_get_reg128(GByteArray *array, uint64_t val_hi,
>   uint64_t val_lo)
>  {
>  uint64_t to_quad;
>  #ifdef TARGET_WORDS_BIGENDIAN
>  to_quad = tswap64(val_hi);
> -g_byte_array_append(buf, (uint8_t *) &to_quad, 8);
> +g_byte_array_append(array, (uint8_t *) &to_quad, 8);
>  to_quad = tswap64(val_lo);
> -g_byte_array_append(buf, (uint8_t *) &to_quad, 8);
> +g_byte_array_append(array, (uint8_t *) &to_quad, 8);
>  #else
>  to_quad = tswap64(val_lo);
> -g_byte_array_append(buf, (uint8_t *) &to_quad, 8);
> +g_byte_array_append(array, (uint8_t *) &to_quad, 8);
>  to_quad = tswap64(val_hi);
> -g_byte_array_append(buf, (uint8_t *) &to_quad, 8);
> +g_byte_array_append(array, (uint8_t *) &to_quad, 8);
>  #endif
>  return 16;
>  }
> @@ -154,16 +154,16 @@ static inline int gdb_get_zeroes(GByteArray *array, 
> size_t len)
>   * element for additional processing. Some front-ends do additional
>   * dynamic swapping of the elements based on CPU state.
>   */
> -static inline uint8_t * gdb_get_reg_ptr(GByteArray *buf, int len)
> +static inline uint8_t *gdb_get_reg_ptr(GByteArray *array, int len)
>  {
> -ret

Re: [PATCH RFC v2] target/arm: Implement SVE2 MATCH, NMATCH

2020-04-14 Thread Richard Henderson
On 4/14/20 4:16 PM, Stephen Long wrote:
> Signed-off-by: Stephen Long 
> ---
>  target/arm/helper-sve.h| 10 
>  target/arm/sve.decode  |  5 
>  target/arm/sve_helper.c| 51 ++
>  target/arm/translate-sve.c | 22 
>  4 files changed, 88 insertions(+)
> 
> diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
> index 5dd880cf6d..bc4a463bc7 100644
> --- a/target/arm/helper-sve.h
> +++ b/target/arm/helper-sve.h
> @@ -2516,6 +2516,16 @@ DEF_HELPER_FLAGS_3(sve2_uqrshrnt_h, TCG_CALL_NO_RWG, 
> void, ptr, ptr, i32)
>  DEF_HELPER_FLAGS_3(sve2_uqrshrnt_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
>  DEF_HELPER_FLAGS_3(sve2_uqrshrnt_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
>  
> +DEF_HELPER_FLAGS_5(sve2_match_ppzz_b, TCG_CALL_NO_RWG,
> +   i32, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(sve2_match_ppzz_h, TCG_CALL_NO_RWG,
> +   i32, ptr, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_b, TCG_CALL_NO_RWG,
> +   i32, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(sve2_nmatch_ppzz_h, TCG_CALL_NO_RWG,
> +   i32, ptr, ptr, ptr, ptr, i32)
> +
>  DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_h, TCG_CALL_NO_RWG,
> void, ptr, ptr, ptr, ptr, ptr, i32)
>  DEF_HELPER_FLAGS_6(sve2_faddp_zpzz_s, TCG_CALL_NO_RWG,
> diff --git a/target/arm/sve.decode b/target/arm/sve.decode
> index 374e47fb05..652668df02 100644
> --- a/target/arm/sve.decode
> +++ b/target/arm/sve.decode
> @@ -1305,6 +1305,11 @@ UQSHRNT 01000101 .. 1 . 00 1101 . 
> .  @rd_rn_tszimm_shr
>  UQRSHRNB01000101 .. 1 . 00 1110 . .  @rd_rn_tszimm_shr
>  UQRSHRNT01000101 .. 1 . 00  . .  @rd_rn_tszimm_shr
>  
> +### SVE2 Character Match
> +
> +MATCH   01000101 .. 1 . 100 ... . 0  @pd_pg_rn_rm
> +NMATCH  01000101 .. 1 . 100 ... . 1  @pd_pg_rn_rm
> +
>  ## SVE2 floating-point pairwise operations
>  
>  FADDP   01100100 .. 010 00 0 100 ... . . @rdn_pg_rm
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index b68f62cd7f..78abd8b62a 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -6890,3 +6890,54 @@ DO_ST1_ZPZ_D(dd_be, zd, MO_64)
>  
>  #undef DO_ST1_ZPZ_S
>  #undef DO_ST1_ZPZ_D
> +
> +#define DO_PPZZ_CHAR_MATCH(NAME, TYPE, OP, H, MASK, DEFAULT_VAL) 
>  \
> +static inline bool NAME##_inner_loop(TYPE nn, void *segmentbase) 
>  \
> +{
>  \
> +intptr_t i = 128;
>  \
> +do { 
>  \
> +do { 
>  \
> +i -= sizeof(TYPE) * 8;   
>  \
> +TYPE mm = *(TYPE *)(segmentbase + H1(i));
>  \
> +if (nn OP mm) {  
>  \
> +return !DEFAULT_VAL; 
>  \
> +}
>  \
> +} while (i & 63);
>  \
> +} while (i > 0); 
>  \
> +return DEFAULT_VAL;  
>  \
> +}
>  \

You seem to be mixing up bits and bytes here, with 128 bits and H1 as a byte
index.

I note that we don't need to keep re-loading the Zm segment elements from
memory.  Perhaps something like

static inline bool do_match1(uint64_t n, uint64_t m, int esz)
{
int i, bits = 8 << esz;
n = extract64(n, 0, bits);
for (i = 0; i < 64; i += bits) {
if (n == extract64(m, i, bits)) {
return true;
}
}
return false;
}

static inline bool do_match2(uint64_t n, uint64_t m0,
 uint64_t m1, int esz)
{
return do_match1(n, m0, esz) || do_match1(n, m1, esz);
}


As an improvement, we can use

https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord

static inline bool do_match2(uint64_t n, uint64_t m0,
 uint64_t m1, int esz)
{
int bits = 8 << esz;
uint64_t ones = dup_const(esz, 1);
uint64_t signs = ones << (bits - 1);
uint64_t cmp0, cmp1;

cmp1 = dup_const(esz, n);
cmp0 = cmp1 ^ m0;
cmp1 = cmp1 ^ m1;
cmp0 = (cmp0 - ones) & ~cmp0;
cmp1 = (cmp1 - ones) & ~cmp1;
return (cmp0 | cmp1) & signs;
}
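
For what it's worth, the byte case (esz == 0) of this trick can be
sanity-checked outside QEMU with local stand-ins for dup_const; the test
values below are made up for illustration:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

static bool match_byte(uint8_t n, uint64_t m0, uint64_t m1)
{
    uint64_t ones  = 0x0101010101010101ull;
    uint64_t signs = 0x8080808080808080ull;
    uint64_t rep = ones * n;               /* dup_const(MO_8, n) */
    uint64_t c0 = rep ^ m0, c1 = rep ^ m1;

    c0 = (c0 - ones) & ~c0;
    c1 = (c1 - ones) & ~c1;
    return ((c0 | c1) & signs) != 0;
}

int main(void)
{
    uint64_t m0 = 0x6162636465666768ull;   /* bytes 'a'..'h' */
    uint64_t m1 = 0x696a6b6c6d6e6f70ull;   /* bytes 'i'..'p' */

    /* prints "1 0": 'd' is in the segment, 'z' is not */
    printf("%d %d\n", match_byte('d', m0, m1), match_byte('z', m0, m1));
    return 0;
}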


> +uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) 
>  \
> +{
>  \
> +intptr_t 

Re: [RFC PATCH v1 00/26] VM introspection

2020-04-14 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200415005938.23895-1-ala...@bitdefender.com/



Hi,

This series failed the asan build test. Please find the testing commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
export ARCH=x86_64
make docker-image-fedora V=1 NETWORK=1
time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  chardev/char-mux.o
  CC  chardev/char-null.o

Warning, treated as error:
/tmp/qemu-test/src/docs/../qemu-options.hx:5041:Inline literal start-string 
without end-string.
make: *** [Makefile:1115: 
.docs_system_qemu.1_docs_system_qemu-block-drivers.7_docs_system_qemu-cpu-models.7.sentinel.]
 Error 2
make: *** Deleting file 
'.docs_system_qemu.1_docs_system_qemu-block-drivers.7_docs_system_qemu-cpu-models.7.sentinel.'
make: *** Waiting for unfinished jobs

Warning, treated as error:
/tmp/qemu-test/src/docs/../qemu-options.hx:5041:Inline literal start-string 
without end-string.
make: *** [Makefile:1104: docs/system/index.html] Error 2
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in 
sys.exit(main())
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=14969c1b7a7f4cb589d53f5ee4a705f4', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 
'TARGET_LIST=x86_64-softmmu', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 
'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', 
'-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-a4iu7vy2/src/docker-src.2020-04-14-21.58.48.26010:/var/tmp/qemu:z,ro',
 'qemu:fedora', '/var/tmp/qemu/run', 'test-debug']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=14969c1b7a7f4cb589d53f5ee4a705f4
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-a4iu7vy2/src'
make: *** [docker-run-test-debug@fedora] Error 2

real3m22.713s
user0m7.733s


The full log is available at
http://patchew.org/logs/20200415005938.23895-1-ala...@bitdefender.com/testing.asan/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v2 for 5.0-rc3 00/17] more randome fixes (user, pie, docker and gdbstub)

2020-04-14 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200414200631.12799-1-alex.ben...@linaro.org/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Subject: [PATCH  v2 for 5.0-rc3 00/17] more randome fixes (user, pie, docker 
and gdbstub)
Message-id: 20200414200631.12799-1-alex.ben...@linaro.org
Type: series

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Switched to a new branch 'test'
1d9b560 tests/tcg: add a multiarch linux-user gdb test
042 tests/tcg: drop inferior.was_attached() test
d252cf5 target/m68k: hack around the FPU register support (HACK!)
fe2d373 gdbstub: Introduce gdb_get_float64() to get 64-bit float registers
a067a36 gdbstub: Introduce gdb_get_float32() to get 32-bit float registers
22159df gdbstub: Do not use memset() on GByteArray
0440574 gdbstub: i386: Fix gdb_get_reg16() parameter to unbreak gdb
661c288 target/m68k/helper: Fix m68k_fpu_gdb_get_reg() use of GByteArray
6d7f04a linux-user: fix /proc/self/stat handling
8077d16 configure: disable PIE for Windows builds
3525a95 configure: redirect sphinx-build check to config.log
d7cc6ea tests/docker: add docs FEATURE flag and use for test-misc
2faca0a linux-user/ppc: Fix padding in mcontext_t for ppc64
bd002dd .gitignore: include common build sub-directories
2cb4915 accel/tcg: Relax va restrictions on 64-bit guests
b81214a exec/cpu-all: Use bool for have_guest_base
56dda0e linux-user: completely re-write init_guest_space

=== OUTPUT BEGIN ===
1/17 Checking commit 56dda0ee6d96 (linux-user: completely re-write 
init_guest_space)
2/17 Checking commit b81214a4a7b1 (exec/cpu-all: Use bool for have_guest_base)
3/17 Checking commit 2cb4915f8641 (accel/tcg: Relax va restrictions on 64-bit 
guests)
ERROR: Macros with complex values should be enclosed in parenthesis
#91: FILE: include/exec/cpu-all.h:182:
+# define GUEST_ADDR_MAX_  ~0ul

total: 1 errors, 0 warnings, 88 lines checked

Patch 3/17 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

4/17 Checking commit bd002e97 (.gitignore: include common build 
sub-directories)
5/17 Checking commit 2faca0a20813 (linux-user/ppc: Fix padding in mcontext_t 
for ppc64)
6/17 Checking commit d7cc6eab6083 (tests/docker: add docs FEATURE flag and use 
for test-misc)
7/17 Checking commit 3525a9514500 (configure: redirect sphinx-build check to 
config.log)
8/17 Checking commit 8077d168511c (configure: disable PIE for Windows builds)
9/17 Checking commit 6d7f04aaf64a (linux-user: fix /proc/self/stat handling)
10/17 Checking commit 661c288d6e96 (target/m68k/helper: Fix 
m68k_fpu_gdb_get_reg() use of GByteArray)
11/17 Checking commit 04405743253c (gdbstub: i386: Fix gdb_get_reg16() 
parameter to unbreak gdb)
12/17 Checking commit 22159df16cb9 (gdbstub: Do not use memset() on GByteArray)
13/17 Checking commit a067a369add1 (gdbstub: Introduce gdb_get_float32() to get 
32-bit float registers)
14/17 Checking commit fe2d3733f79b (gdbstub: Introduce gdb_get_float64() to get 
64-bit float registers)
15/17 Checking commit d252cf53d601 (target/m68k: hack around the FPU register 
support (HACK!))
ERROR: space prohibited after that '*' (ctx:BxW)
#88: FILE: target/m68k/helper.c:130:
+env->fregs[n].l.lower = le64_to_cpu(* (uint64_t *) mem_buf);
 ^

ERROR: space prohibited after that '*' (ctx:BxW)
#89: FILE: target/m68k/helper.c:131:
+env->fregs[n].l.upper = le16_to_cpu(* (uint16_t *) (mem_buf + 8));
 ^

total: 2 errors, 0 warnings, 45 lines checked

Patch 15/17 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

16/17 Checking commit 0422b3bd (tests/tcg: drop inferior.was_attached() 
test)
17/17 Checking commit 1d9b560da197 (tests/tcg: add a multiarch linux-user gdb 
test)
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#40: 
new file mode 100644

total: 0 errors, 1 warnings, 101 lines checked

Patch 17/17 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20200414200631.12799-1-alex.ben...@linaro.org/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

[RFC PATCH v1 26/26] kvm: vmi: add 'command' and 'event' properties

2020-04-14 Thread Adalbert Lazăr
There are cases when access to an introspected VM must be limited
to certain introspection commands/events.

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 86 ++---
 1 file changed, 74 insertions(+), 12 deletions(-)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index f70d78848a..1574a643c4 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -73,6 +73,9 @@ typedef struct VMIntrospection {
 QDict *qmp_rsp;
 
 bool kvmi_hooked;
+
+GArray *allowed_commands;
+GArray *allowed_events;
 } VMIntrospection;
 
 typedef struct VMIntrospectionClass {
@@ -94,6 +97,8 @@ static bool suspend_pending;
 static bool migrate_pending;
 static bool shutdown_pending;
 
+static __s32 all_IDs = -1;
+
 #define TYPE_VM_INTROSPECTION "introspection"
 
 #define VM_INTROSPECTION(obj) \
@@ -239,6 +244,25 @@ static void prop_set_uint32(Object *obj, Visitor *v, const 
char *name,
 }
 }
 
+static void prop_add_to_array(Object *obj, Visitor *v,
+  const char *name, void *opaque,
+  Error **errp)
+{
+Error *local_err = NULL;
+GArray *arr = opaque;
+uint32_t value;
+
+visit_type_uint32(v, name, &value, &local_err);
+if (!local_err && value == (uint32_t)all_IDs) {
+error_setg(&local_err, "VMI: add %s: invalid id %d", name, value);
+}
+if (local_err) {
+error_propagate(errp, local_err);
+} else {
+g_array_append_val(arr, value);
+}
+}
+
 static bool chardev_is_connected(VMIntrospection *i, Error **errp)
 {
 Object *obj = OBJECT(i->chr);
@@ -286,6 +310,15 @@ static void instance_init(Object *obj)
 object_property_add_str(obj, "chardev", NULL, prop_set_chardev, NULL);
 object_property_add_str(obj, "key", NULL, prop_set_key, NULL);
 
+i->allowed_commands = g_array_new(FALSE, FALSE, sizeof(uint32_t));
+object_property_add(obj, "command", "uint32",
+prop_add_to_array, NULL,
+NULL, i->allowed_commands, NULL);
+i->allowed_events = g_array_new(FALSE, FALSE, sizeof(uint32_t));
+object_property_add(obj, "event", "uint32",
+prop_add_to_array, NULL,
+NULL, i->allowed_events, NULL);
+
 i->handshake_timeout = HANDSHAKE_TIMEOUT_SEC;
 object_property_add(obj, "handshake_timeout", "uint32",
 prop_set_uint32, prop_get_uint32,
@@ -368,6 +401,13 @@ static void instance_finalize(Object *obj)
 VMIntrospectionClass *ic = VM_INTROSPECTION_CLASS(obj->class);
 VMIntrospection *i = VM_INTROSPECTION(obj);
 
+if (i->allowed_commands) {
+g_array_free(i->allowed_commands, TRUE);
+}
+if (i->allowed_events) {
+g_array_free(i->allowed_events, TRUE);
+}
+
 g_free(i->chardevid);
 g_free(i->keyid);
 
@@ -531,11 +571,39 @@ static bool validate_handshake(VMIntrospection *i, Error 
**errp)
 return true;
 }
 
+static bool set_allowed_features(int ioctl, GArray *allowed, Error **errp)
+{
+struct kvm_introspection_feature feature;
+gint i;
+
+feature.allow = 1;
+
+if (allowed->len == 0) {
+feature.id = all_IDs;
+if (kvm_vm_ioctl(kvm_state, ioctl, &feature)) {
+goto out_err;
+}
+} else {
+for (i = 0; i < allowed->len; i++) {
+feature.id = g_array_index(allowed, uint32_t, i);
+if (kvm_vm_ioctl(kvm_state, ioctl, &feature)) {
+goto out_err;
+}
+}
+}
+
+return true;
+
+out_err:
+error_setg_errno(errp, -errno,
+ "VMI: feature %d with id %d failed",
+ ioctl, feature.id);
+return false;
+}
+
 static bool connect_kernel(VMIntrospection *i, Error **errp)
 {
-struct kvm_introspection_feature commands, events;
 struct kvm_introspection_hook kernel;
-const __s32 all_ids = -1;
 
 memset(&kernel, 0, sizeof(kernel));
 memcpy(kernel.uuid, &qemu_uuid, sizeof(kernel.uuid));
@@ -553,20 +621,14 @@ static bool connect_kernel(VMIntrospection *i, Error 
**errp)
 
 i->kvmi_hooked = true;
 
-commands.allow = 1;
-commands.id = all_ids;
-if (kvm_vm_ioctl(kvm_state, KVM_INTROSPECTION_COMMAND, &commands)) {
-error_setg_errno(errp, -errno,
- "VMI: ioctl/KVM_INTROSPECTION_COMMAND failed");
+if (!set_allowed_features(KVM_INTROSPECTION_COMMAND,
+ i->allowed_commands, errp)) {
 unhook_kvmi(i);
 return false;
 }
 
-events.allow = 1;
-events.id = all_ids;
-if (kvm_vm_ioctl(kvm_state, KVM_INTROSPECTION_EVENT, &events)) {
-error_setg_errno(errp, -errno,
- "VMI: ioctl/KVM_INTROSPECTION_EVENT failed");
+if (!set_allowed_features(KVM_INTROSPECTION_EVENT,
+ i->allowed_events, errp)) {
 unhook_kvmi(i);
 return false;
 }



[RFC PATCH v1 21/26] kvm: vmi: postpone the OK response from qmp_stop()

2020-04-14 Thread Adalbert Lazăr
The method of postponing the intercepted command (pause/suspend/migrate)
until the introspection tool has had the chance to remove its hooks
(e.g. breakpoints) from the guest doesn't work for snapshot+memory (at
least as it is done by libvirt/virt-manager 1.3.1). The sequence
qmp_stop()+save_vm+qmp_cont() doesn't wait for the STOP event: save_vm()
is called right after qmp_stop() returns OK. What we do is postpone
this OK response until the introspection tool finishes the unhook
process.

CC: Markus Armbruster 
Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c| 29 +
 accel/stubs/vmi-stubs.c|  7 +++
 include/monitor/monitor.h  |  1 +
 include/sysemu/vmi-intercept.h |  2 +-
 monitor/Makefile.objs  |  2 +-
 monitor/qmp.c  | 11 +++
 monitor/stubs.c|  9 +
 7 files changed, 59 insertions(+), 2 deletions(-)
 create mode 100644 monitor/stubs.c

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index ea7191e48d..01034d460e 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -10,6 +10,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
 #include "qemu/error-report.h"
 #include "qom/object_interfaces.h"
 #include "sysemu/sysemu.h"
@@ -23,6 +24,8 @@
 #include "migration/vmstate.h"
 #include "migration/migration.h"
 #include "migration/misc.h"
+#include "qapi/qmp/qobject.h"
+#include "monitor/monitor.h"
 
 #include "sysemu/vmi-intercept.h"
 #include "sysemu/vmi-handshake.h"
@@ -63,6 +66,9 @@ typedef struct VMIntrospection {
 Notifier migration_state_change;
 bool created_from_command_line;
 
+void *qmp_monitor;
+QDict *qmp_rsp;
+
 bool kvmi_hooked;
 } VMIntrospection;
 
@@ -333,6 +339,8 @@ static void instance_finalize(Object *obj)
 
 error_free(i->init_error);
 
+qobject_unref(i->qmp_rsp);
+
 ic->instance_counter--;
 if (!ic->instance_counter) {
 ic->uniq = NULL;
@@ -506,6 +514,12 @@ static void 
continue_with_the_intercepted_action(VMIntrospection *i)
 
 info_report("VMI: continue with '%s'",
 action_string[i->intercepted_action]);
+
+if (i->qmp_rsp) {
+monitor_qmp_respond_later(i->qmp_monitor, i->qmp_rsp);
+i->qmp_monitor = NULL;
+i->qmp_rsp = NULL;
+}
 }
 
 /*
@@ -676,6 +690,21 @@ static VMIntrospection *vm_introspection_object(void)
 return ic ? ic->uniq : NULL;
 }
 
+bool vm_introspection_qmp_delay(void *mon, QDict *rsp)
+{
+VMIntrospection *i = vm_introspection_object();
+bool intercepted;
+
+intercepted = i && i->intercepted_action == VMI_INTERCEPT_SUSPEND;
+
+if (intercepted) {
+i->qmp_monitor = mon;
+i->qmp_rsp = rsp;
+}
+
+return intercepted;
+}
+
 /*
  * This ioctl succeeds only when KVM signals the introspection tool.
  * (the socket is connected and the event was sent without error).
diff --git a/accel/stubs/vmi-stubs.c b/accel/stubs/vmi-stubs.c
index 1bd93b2ca5..0cb1d6572b 100644
--- a/accel/stubs/vmi-stubs.c
+++ b/accel/stubs/vmi-stubs.c
@@ -1,7 +1,14 @@
 #include "qemu/osdep.h"
+#include "qapi/qmp/qdict.h"
+
 #include "sysemu/vmi-intercept.h"
 
 bool vm_introspection_intercept(VMI_intercept_command ic, Error **errp)
 {
 return false;
 }
+
+bool vm_introspection_qmp_delay(void *mon, QDict *rsp)
+{
+return false;
+}
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 1018d754a6..1b3debc635 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -47,5 +47,6 @@ int monitor_fdset_get_fd(int64_t fdset_id, int flags);
 int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd);
 void monitor_fdset_dup_fd_remove(int dup_fd);
 int64_t monitor_fdset_dup_fd_find(int dup_fd);
+void monitor_qmp_respond_later(void *_mon, QDict *rsp);
 
 #endif /* MONITOR_H */
diff --git a/include/sysemu/vmi-intercept.h b/include/sysemu/vmi-intercept.h
index b4a9a3faa7..4b93d17f2b 100644
--- a/include/sysemu/vmi-intercept.h
+++ b/include/sysemu/vmi-intercept.h
@@ -19,6 +19,6 @@ typedef enum {
 } VMI_intercept_command;
 
 bool vm_introspection_intercept(VMI_intercept_command ic, Error **errp);
-bool vm_introspection_qmp_delay(void *mon, QObject *id, bool resume);
+bool vm_introspection_qmp_delay(void *mon, QDict *rsp);
 
 #endif /* QEMU_VMI_INTERCEPT_H */
diff --git a/monitor/Makefile.objs b/monitor/Makefile.objs
index a8533c9dd7..16652ed162 100644
--- a/monitor/Makefile.objs
+++ b/monitor/Makefile.objs
@@ -3,4 +3,4 @@ common-obj-y += monitor.o qmp.o hmp.o
 common-obj-y += qmp-cmds.o qmp-cmds-control.o
 common-obj-y += hmp-cmds.o
 
-storage-daemon-obj-y += monitor.o qmp.o qmp-cmds-control.o
+storage-daemon-obj-y += monitor.o qmp.o qmp-cmds-control.o stubs.o
diff --git a/monitor/qmp.c b/monitor/qmp.c
index f89e7daf27..fc9ea7eafa 100644
--- a/monitor/qmp.c
+++ b/monitor/qmp.c
@@ -32,6 +32,7 @@
 #include "qapi/qmp/qjson.h"
 #include "qapi/qmp/qlist.h"
 #include "qapi/qmp/qstring.h"
+#include

[RFC PATCH v1 16/26] kvm: vmi: intercept pause/resume

2020-04-14 Thread Adalbert Lazăr
From: Marian Rotariu 

Because the introspection tool can run on another VM, suspending either
of these two VMs requires signaling the introspection tool to remove
any changes made to the introspected VM. This is done through the
KVM_INTROSPECTION_PREUNHOOK ioctl. KVM will send an event through the
introspection socket, if active. QEMU will wait for the introspection tool
to let the VM run without being introspected and close the socket.

While the guest is suspended, the socket reconnection is disabled.

CC: Markus Armbruster 
Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c| 147 +
 accel/stubs/Makefile.objs  |   1 +
 accel/stubs/vmi-stubs.c|   7 ++
 include/sysemu/vmi-intercept.h |  21 +
 monitor/qmp-cmds.c |  10 +++
 5 files changed, 186 insertions(+)
 create mode 100644 accel/stubs/vmi-stubs.c
 create mode 100644 include/sysemu/vmi-intercept.h

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 5beec2b091..151e27265a 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -14,12 +14,14 @@
 #include "qom/object_interfaces.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/reset.h"
+#include "sysemu/runstate.h"
 #include "sysemu/kvm.h"
 #include "crypto/secret.h"
 #include "crypto/hash.h"
 #include "chardev/char.h"
 #include "chardev/char-fe.h"
 
+#include "sysemu/vmi-intercept.h"
 #include "sysemu/vmi-handshake.h"
 
 #define HANDSHAKE_TIMEOUT_SEC 10
@@ -45,6 +47,10 @@ typedef struct VMIntrospection {
 GSource *hsk_timer;
 uint32_t handshake_timeout;
 
+int intercepted_action;
+
+int reconnect_time;
+
 int64_t vm_start_time;
 
 Notifier machine_ready;
@@ -59,6 +65,14 @@ typedef struct VMIntrospectionClass {
 VMIntrospection *uniq;
 } VMIntrospectionClass;
 
+static const char *action_string[] = {
+"none",
+"suspend",
+"resume",
+};
+
+static bool suspend_pending;
+
 #define TYPE_VM_INTROSPECTION "introspection"
 
 #define VM_INTROSPECTION(obj) \
@@ -412,6 +426,39 @@ static bool connect_kernel(VMIntrospection *i, Error 
**errp)
 return true;
 }
 
+static void enable_socket_reconnect(VMIntrospection *i)
+{
+if (i->sock_fd == -1 && i->reconnect_time) {
+qemu_chr_fe_reconnect_time(&i->sock, i->reconnect_time);
+qemu_chr_fe_disconnect(&i->sock);
+i->reconnect_time = 0;
+}
+}
+
+static void maybe_disable_socket_reconnect(VMIntrospection *i)
+{
+if (i->reconnect_time == 0) {
+info_report("VMI: disable socket reconnect");
+i->reconnect_time = qemu_chr_fe_reconnect_time(&i->sock, 0);
+}
+}
+
+static void continue_with_the_intercepted_action(VMIntrospection *i)
+{
+switch (i->intercepted_action) {
+case VMI_INTERCEPT_SUSPEND:
+vm_stop(RUN_STATE_PAUSED);
+break;
+default:
+error_report("VMI: %s: unexpected action %d",
+ __func__, i->intercepted_action);
+break;
+}
+
+info_report("VMI: continue with '%s'",
+action_string[i->intercepted_action]);
+}
+
 /*
  * We should read only the handshake structure,
  * which might have a different size than what we expect.
@@ -495,6 +542,14 @@ static void chr_event_open(VMIntrospection *i)
 {
 Error *local_err = NULL;
 
+if (suspend_pending) {
+info_report("VMI: %s: too soon (suspend=%d)",
+__func__, suspend_pending);
+maybe_disable_socket_reconnect(i);
+qemu_chr_fe_disconnect(&i->sock);
+return;
+}
+
 if (!send_handshake_info(i, &local_err)) {
 error_append_hint(&local_err, "reconnecting\n");
 warn_report_err(local_err);
@@ -522,6 +577,15 @@ static void chr_event_close(VMIntrospection *i)
 }
 
 cancel_handshake_timer(i);
+
+if (suspend_pending) {
+maybe_disable_socket_reconnect(i);
+
+if (i->intercepted_action != VMI_INTERCEPT_NONE) {
+continue_with_the_intercepted_action(i);
+i->intercepted_action = VMI_INTERCEPT_NONE;
+}
+}
 }
 
 static void chr_event(void *opaque, QEMUChrEvent event)
@@ -540,6 +604,89 @@ static void chr_event(void *opaque, QEMUChrEvent event)
 }
 }
 
+static VMIntrospection *vm_introspection_object(void)
+{
+VMIntrospectionClass *ic;
+
+ic = VM_INTROSPECTION_CLASS(object_class_by_name(TYPE_VM_INTROSPECTION));
+
+return ic ? ic->uniq : NULL;
+}
+
+/*
+ * This ioctl succeeds only when KVM signals the introspection tool.
+ * (the socket is connected and the event was sent without error).
+ */
+static bool signal_introspection_tool_to_unhook(VMIntrospection *i)
+{
+int err;
+
+err = kvm_vm_ioctl(kvm_state, KVM_INTROSPECTION_PREUNHOOK, NULL);
+
+return !err;
+}
+
+static bool record_intercept_action(VMI_intercept_command action)
+{
+switch (action) {
+case VMI_INTERCEPT_SUSPEND:
+suspend_pending = true;
+break;
+case VMI_INTERCEPT_RESUME:
+suspend_pending = false;

[RFC PATCH v1 24/26] kvm: vmi: add 'unhook_on_shutdown' property

2020-04-14 Thread Adalbert Lazăr
Some introspection tools can detect when the guest is shutting down.
This new option, 'unhook_on_shutdown', controls whether QEMU notifies the
introspection tool when a shutdown command is issued at the QEMU level.
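
For example, using the -object syntax documented in patch 08, a setup whose
introspection tool handles guest shutdown by itself could disable the
notification (the boolean spelling below is illustrative):

  -chardev socket,id=vmi_chardev,type=unix,path=/tmp/vmi-guest1.sock,reconnect=10 \
  -object introspection,id=vmi,chardev=vmi_chardev,unhook_on_shutdown=off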

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 2c6981a4bf..02877eec06 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -58,6 +58,7 @@ typedef struct VMIntrospection {
 GSource *unhook_timer;
 uint32_t unhook_timeout;
 bool async_unhook;
+bool unhook_on_shutdown;
 
 int reconnect_time;
 
@@ -203,6 +204,20 @@ static void prop_set_async_unhook(Object *obj, bool value, Error **errp)
 i->async_unhook = value;
 }
 
+static bool prop_get_unhook_on_shutdown(Object *obj, Error **errp)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+return i->unhook_on_shutdown;
+}
+
+static void prop_set_unhook_on_shutdown(Object *obj, bool value, Error **errp)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+i->unhook_on_shutdown = value;
+}
+
 static void prop_get_uint32(Object *obj, Visitor *v, const char *name,
 void *opaque, Error **errp)
 {
@@ -285,6 +300,11 @@ static void instance_init(Object *obj)
  prop_get_async_unhook,
  prop_set_async_unhook, NULL);
 
+i->unhook_on_shutdown = true;
+object_property_add_bool(obj, "unhook_on_shutdown",
+ prop_get_unhook_on_shutdown,
+ prop_set_unhook_on_shutdown, NULL);
+
 vmstate_register(NULL, 0, &vmstate_introspection, i);
 }
 
@@ -801,6 +821,11 @@ static bool intercept_action(VMIntrospection *i,
 }
 
 switch (action) {
+case VMI_INTERCEPT_SHUTDOWN:
+if (!i->unhook_on_shutdown) {
+return false;
+}
+break;
 case VMI_INTERCEPT_FORCE_RESET:
 disconnect_and_unhook_kvmi(i);
 return false;



[RFC PATCH v1 12/26] kvm: vmi: add 'key' property

2020-04-14 Thread Adalbert Lazăr
The introspection tool can be authenticated if the 'key' parameter is
set to the ID of a secret object holding a secret shared between the
introspection tool and the QEMU instance of the introspected VM.
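
A hypothetical invocation, mirroring the documentation added in patch 08 (the
secret file path is just an example); the cookie returned by the introspection
tool during the handshake must be the SHA-1 hash of this shared secret, as
computed by make_cookie_hash() below:

  -object secret,id=vmi_key,file=/etc/secret \
  -chardev socket,id=vmi_chardev,type=unix,path=/tmp/vmi-guest1.sock,reconnect=10 \
  -object introspection,id=vmi,chardev=vmi_chardev,key=vmi_key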

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 66 +
 1 file changed, 66 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 5659663caa..f456ca56ef 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -14,6 +14,8 @@
 #include "qom/object_interfaces.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
+#include "crypto/secret.h"
+#include "crypto/hash.h"
 #include "chardev/char.h"
 #include "chardev/char-fe.h"
 
@@ -31,6 +33,11 @@ typedef struct VMIntrospection {
 CharBackend sock;
 int sock_fd;
 
+char *keyid;
+Object *key;
+uint8_t cookie_hash[QEMU_VMI_COOKIE_HASH_SIZE];
+bool key_with_cookie;
+
 qemu_vmi_from_introspector hsk_in;
 uint64_t hsk_in_read_pos;
 uint64_t hsk_in_read_size;
@@ -109,6 +116,14 @@ static void prop_set_chardev(Object *obj, const char *value, Error **errp)
 i->chardevid = g_strdup(value);
 }
 
+static void prop_set_key(Object *obj, const char *value, Error **errp)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+g_free(i->keyid);
+i->keyid = g_strdup(value);
+}
+
 static void prop_get_uint32(Object *obj, Visitor *v, const char *name,
 void *opaque, Error **errp)
 {
@@ -153,6 +168,7 @@ static void instance_init(Object *obj)
 update_vm_start_time(i);
 
 object_property_add_str(obj, "chardev", NULL, prop_set_chardev, NULL);
+object_property_add_str(obj, "key", NULL, prop_set_key, NULL);
 
 i->handshake_timeout = HANDSHAKE_TIMEOUT_SEC;
 object_property_add(obj, "handshake_timeout", "uint32",
@@ -213,6 +229,7 @@ static void instance_finalize(Object *obj)
 VMIntrospection *i = VM_INTROSPECTION(obj);
 
 g_free(i->chardevid);
+g_free(i->keyid);
 
 cancel_handshake_timer(i);
 
@@ -276,6 +293,16 @@ static bool send_handshake_info(VMIntrospection *i, Error **errp)
 return true;
 }
 
+static bool validate_handshake_cookie(VMIntrospection *i)
+{
+if (!i->key_with_cookie) {
+return true;
+}
+
+return 0 == memcmp(&i->cookie_hash, &i->hsk_in.cookie_hash,
+   sizeof(i->cookie_hash));
+}
+
 static bool validate_handshake(VMIntrospection *i, Error **errp)
 {
 uint32_t min_accepted_size;
@@ -288,6 +315,11 @@ static bool validate_handshake(VMIntrospection *i, Error **errp)
 return false;
 }
 
+if (!validate_handshake_cookie(i)) {
+error_setg(errp, "VMI: received cookie doesn't match");
+return false;
+}
+
 /*
  * Check hsk_in.struct_size and sizeof(hsk_in) before accessing any
  * other fields. We might get fewer bytes from applications using
@@ -468,6 +500,31 @@ static void chr_event(void *opaque, QEMUChrEvent event)
 }
 }
 
+static bool make_cookie_hash(const char *key_id, uint8_t *cookie_hash,
+ Error **errp)
+{
+uint8_t *cookie = NULL, *hash = NULL;
+size_t cookie_size, hash_size = 0;
+bool done = false;
+
+if (qcrypto_secret_lookup(key_id, &cookie, &cookie_size, errp) == 0
+&& qcrypto_hash_bytes(QCRYPTO_HASH_ALG_SHA1,
+  (const char *)cookie, cookie_size,
+  &hash, &hash_size, errp) == 0) {
+if (hash_size == QEMU_VMI_COOKIE_HASH_SIZE) {
+memcpy(cookie_hash, hash, QEMU_VMI_COOKIE_HASH_SIZE);
+done = true;
+} else {
+error_setg(errp, "VMI: hash algorithm size mismatch");
+}
+}
+
+g_free(cookie);
+g_free(hash);
+
+return done;
+}
+
 static Error *vm_introspection_init(VMIntrospection *i)
 {
 Error *err = NULL;
@@ -486,6 +543,15 @@ static Error *vm_introspection_init(VMIntrospection *i)
 return err;
 }
 
+if (i->keyid) {
+if (!make_cookie_hash(i->keyid, i->cookie_hash, &err)) {
+return err;
+}
+i->key_with_cookie = true;
+} else {
+warn_report("VMI: the introspection tool won't be 'authenticated'");
+}
+
 chr = qemu_chr_find(i->chardevid);
 if (!chr) {
 error_setg(&err, "VMI: device '%s' not found", i->chardevid);



[RFC PATCH v1 22/26] kvm: vmi: add 'async_unhook' property

2020-04-14 Thread Adalbert Lazăr
The default way of handling the intercepted commands
(pause/suspend/migrate) might not be the simplest one. We add an
alternative method, used when async_unhook is set to false, that runs
the main loop until the introspection tool finishes the unhook process
and closes the introspection socket.
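
A hypothetical command line selecting the synchronous variant, following the
-object syntax documented in patch 08; with async_unhook=off the intercepted
action is completed only after wait_until_the_socket_is_closed() returns:

  -object introspection,id=vmi,chardev=vmi_chardev,async_unhook=off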

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 38 ++
 1 file changed, 38 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 01034d460e..bee9798e54 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -57,6 +57,7 @@ typedef struct VMIntrospection {
 int intercepted_action;
 GSource *unhook_timer;
 uint32_t unhook_timeout;
+bool async_unhook;
 
 int reconnect_time;
 
@@ -186,6 +187,20 @@ static void prop_set_key(Object *obj, const char *value, Error **errp)
 i->keyid = g_strdup(value);
 }
 
+static bool prop_get_async_unhook(Object *obj, Error **errp)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+return i->async_unhook;
+}
+
+static void prop_set_async_unhook(Object *obj, bool value, Error **errp)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+i->async_unhook = value;
+}
+
 static void prop_get_uint32(Object *obj, Visitor *v, const char *name,
 void *opaque, Error **errp)
 {
@@ -263,6 +278,11 @@ static void instance_init(Object *obj)
 prop_set_uint32, prop_get_uint32,
 NULL, &i->unhook_timeout, NULL);
 
+i->async_unhook = true;
+object_property_add_bool(obj, "async_unhook",
+ prop_get_async_unhook,
+ prop_set_async_unhook, NULL);
+
 vmstate_register(NULL, 0, &vmstate_introspection, i);
 }
 
@@ -739,6 +759,19 @@ static bool record_intercept_action(VMI_intercept_command action)
 return true;
 }
 
+static void wait_until_the_socket_is_closed(VMIntrospection *i)
+{
+info_report("VMI: start waiting until fd=%d is closed", i->sock_fd);
+
+while (i->sock_fd != -1) {
+main_loop_wait(false);
+}
+
+info_report("VMI: continue with the intercepted action fd=%d", i->sock_fd);
+
+maybe_disable_socket_reconnect(i);
+}
+
 static bool intercept_action(VMIntrospection *i,
  VMI_intercept_command action, Error **errp)
 {
@@ -767,6 +800,11 @@ static bool intercept_action(VMIntrospection *i,
   i->unhook_timeout * 1000,
   unhook_timeout_cbk, i);
 
+if (!i->async_unhook) {
+wait_until_the_socket_is_closed(i);
+return false;
+}
+
 i->intercepted_action = action;
 return true;
 }



[RFC PATCH v1 10/26] kvm: vmi: add the handshake with the introspection tool

2020-04-14 Thread Adalbert Lazăr
QEMU sends the VM name, the UUID and the VM start time, and expects the
hash of a secret shared with the introspection tool, which can be used to
authenticate it.
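
A minimal tool-side sketch of this exchange, assuming the
qemu_vmi_to_introspector/qemu_vmi_from_introspector layout from the
vmi-handshake.h header added by this patch (struct_size/uuid/start_time/name
going out, struct_size/cookie_hash coming back); short reads and error
reporting are trimmed for brevity:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include "sysemu/vmi-handshake.h"

/* Read QEMU's announcement, then reply with the expected cookie hash. */
static int tool_handshake(int fd, const uint8_t cookie[QEMU_VMI_COOKIE_HASH_SIZE])
{
    qemu_vmi_to_introspector in;
    qemu_vmi_from_introspector out;
    uint32_t announced, want;
    uint8_t scratch[256];

    memset(&in, 0, sizeof(in));

    /* The first field is always struct_size; it tells us how much follows. */
    if (read(fd, &in.struct_size, sizeof(in.struct_size)) !=
            (ssize_t)sizeof(in.struct_size) ||
        in.struct_size < sizeof(in.struct_size)) {
        return -1;
    }
    announced = in.struct_size;
    want = announced < sizeof(in) ? announced : (uint32_t)sizeof(in);
    if (read(fd, (uint8_t *)&in + sizeof(in.struct_size),
             want - sizeof(in.struct_size)) !=
            (ssize_t)(want - sizeof(in.struct_size))) {
        return -1;
    }
    /* A newer QEMU may send a larger structure: drain the part we don't know. */
    for (announced -= want; announced; ) {
        ssize_t r = read(fd, scratch,
                         announced < sizeof(scratch) ? announced : sizeof(scratch));
        if (r <= 0) {
            return -1;
        }
        announced -= r;
    }

    /* in.uuid, in.name and in.start_time identify the VM/session here. */

    memset(&out, 0, sizeof(out));
    out.struct_size = sizeof(out);
    memcpy(out.cookie_hash, cookie, QEMU_VMI_COOKIE_HASH_SIZE);

    return write(fd, &out, sizeof(out)) == (ssize_t)sizeof(out) ? 0 : -1;
}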

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c| 290 +
 include/sysemu/vmi-handshake.h |  45 +
 2 files changed, 335 insertions(+)
 create mode 100644 include/sysemu/vmi-handshake.h

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 883c666a2a..57ded2f69c 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -8,6 +8,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu-common.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qom/object_interfaces.h"
@@ -16,6 +17,8 @@
 #include "chardev/char.h"
 #include "chardev/char-fe.h"
 
+#include "sysemu/vmi-handshake.h"
+
 typedef struct VMIntrospection {
 Object parent_obj;
 
@@ -23,9 +26,19 @@ typedef struct VMIntrospection {
 
 char *chardevid;
 Chardev *chr;
+CharBackend sock;
+int sock_fd;
+
+qemu_vmi_from_introspector hsk_in;
+uint64_t hsk_in_read_pos;
+uint64_t hsk_in_read_size;
+
+int64_t vm_start_time;
 
 Notifier machine_ready;
 bool created_from_command_line;
+
+bool kvmi_hooked;
 } VMIntrospection;
 
 #define TYPE_VM_INTROSPECTION "introspection"
@@ -50,6 +63,11 @@ static void machine_ready(Notifier *notifier, void *data)
 }
 }
 
+static void update_vm_start_time(VMIntrospection *i)
+{
+i->vm_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+}
+
 static void complete(UserCreatable *uc, Error **errp)
 {
 VMIntrospection *i = VM_INTROSPECTION(uc);
@@ -87,6 +105,13 @@ static void prop_set_chardev(Object *obj, const char *value, Error **errp)
 i->chardevid = g_strdup(value);
 }
 
+static bool chardev_is_connected(VMIntrospection *i, Error **errp)
+{
+Object *obj = OBJECT(i->chr);
+
+return obj && object_property_get_bool(obj, "connected", errp);
+}
+
 static void class_init(ObjectClass *oc, void *data)
 {
 UserCreatableClass *uc = USER_CREATABLE_CLASS(oc);
@@ -98,17 +123,60 @@ static void instance_init(Object *obj)
 {
 VMIntrospection *i = VM_INTROSPECTION(obj);
 
+i->sock_fd = -1;
 i->created_from_command_line = (qdev_hotplug == false);
 
+update_vm_start_time(i);
+
 object_property_add_str(obj, "chardev", NULL, prop_set_chardev, NULL);
 }
 
+static void disconnect_chardev(VMIntrospection *i)
+{
+if (chardev_is_connected(i, NULL)) {
+qemu_chr_fe_disconnect(&i->sock);
+}
+}
+
+static void unhook_kvmi(VMIntrospection *i)
+{
+if (i->kvmi_hooked) {
+if (kvm_vm_ioctl(kvm_state, KVM_INTROSPECTION_UNHOOK, NULL)) {
+error_report("VMI: ioctl/KVM_INTROSPECTION_UNHOOK failed, errno 
%d",
+ errno);
+}
+i->kvmi_hooked = false;
+}
+}
+
+static void shutdown_socket_fd(VMIntrospection *i)
+{
+/* signal both ends (kernel, introspector) */
+if (i->sock_fd != -1) {
+shutdown(i->sock_fd, SHUT_RDWR);
+i->sock_fd = -1;
+}
+}
+
+static void disconnect_and_unhook_kvmi(VMIntrospection *i)
+{
+shutdown_socket_fd(i);
+disconnect_chardev(i);
+unhook_kvmi(i);
+}
+
 static void instance_finalize(Object *obj)
 {
 VMIntrospection *i = VM_INTROSPECTION(obj);
 
 g_free(i->chardevid);
 
+if (i->chr) {
+shutdown_socket_fd(i);
+qemu_chr_fe_deinit(&i->sock, true);
+unhook_kvmi(i);
+}
+
 error_free(i->init_error);
 }
 
@@ -132,6 +200,210 @@ static void register_types(void)
 
 type_init(register_types);
 
+static bool send_handshake_info(VMIntrospection *i, Error **errp)
+{
+qemu_vmi_to_introspector send = {};
+const char *vm_name;
+int r;
+
+send.struct_size = sizeof(send);
+send.start_time = i->vm_start_time;
+memcpy(&send.uuid, &qemu_uuid, sizeof(send.uuid));
+vm_name = qemu_get_vm_name();
+if (vm_name) {
+snprintf(send.name, sizeof(send.name), "%s", vm_name);
+send.name[sizeof(send.name) - 1] = 0;
+}
+
+r = qemu_chr_fe_write_all(&i->sock, (uint8_t *)&send, sizeof(send));
+if (r != sizeof(send)) {
+error_setg_errno(errp, errno, "VMI: error writing to '%s'",
+ i->chardevid);
+return false;
+}
+
+/* tcp_chr_write may call tcp_chr_disconnect/CHR_EVENT_CLOSED */
+if (!chardev_is_connected(i, errp)) {
+error_append_hint(errp, "VMI: qemu_chr_fe_write_all() failed");
+return false;
+}
+
+return true;
+}
+
+static bool validate_handshake(VMIntrospection *i, Error **errp)
+{
+uint32_t min_accepted_size;
+
+min_accepted_size = offsetof(qemu_vmi_from_introspector, cookie_hash)
++ QEMU_VMI_COOKIE_HASH_SIZE;
+
+if (i->hsk_in.struct_size < min_accepted_size) {
+error_setg(errp, "VMI: not enough or invalid handshake data");
+return false;
+}
+
+/*
+ * Check hsk_in.struct_size and sizeof(hsk_in) before accessing any
+

[RFC PATCH v1 23/26] kvm: vmi: intercept shutdown

2020-04-14 Thread Adalbert Lazăr
From: Marian Rotariu 

On shutdown, it is desirable that the introspection tool removes
its changes from the introspected VM, so that they don't reach the
hibernation file.

CC: Markus Armbruster 
Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c| 31 +++
 include/sysemu/vmi-intercept.h |  1 +
 monitor/qmp-cmds.c |  4 
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index bee9798e54..2c6981a4bf 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -85,10 +85,12 @@ static const char *action_string[] = {
 "resume",
 "force-reset",
 "migrate",
+"shutdown",
 };
 
 static bool suspend_pending;
 static bool migrate_pending;
+static bool shutdown_pending;
 
 #define TYPE_VM_INTROSPECTION "introspection"
 
@@ -511,6 +513,17 @@ static void enable_socket_reconnect(VMIntrospection *i)
 
 static void maybe_disable_socket_reconnect(VMIntrospection *i)
 {
+if (shutdown_pending) {
+/*
+ * We've got the shutdown notification, but the guest might not stop.
+ * We already caused the introspection tool to unhook
+ * because shutdown_pending was set.
+ * Let the socket connect again just in case the guest doesn't stop.
+ */
+shutdown_pending = false;
+return;
+}
+
 if (i->reconnect_time == 0) {
 info_report("VMI: disable socket reconnect");
 i->reconnect_time = qemu_chr_fe_reconnect_time(&i->sock, 0);
@@ -526,6 +539,9 @@ static void continue_with_the_intercepted_action(VMIntrospection *i)
 case VMI_INTERCEPT_MIGRATE:
 start_live_migration_thread(migrate_get_current());
 break;
+case VMI_INTERCEPT_SHUTDOWN:
+qemu_system_powerdown_request();
+break;
 default:
 error_report("VMI: %s: unexpected action %d",
  __func__, i->intercepted_action);
@@ -625,9 +641,10 @@ static void chr_event_open(VMIntrospection *i)
 {
 Error *local_err = NULL;
 
-if (suspend_pending || migrate_pending) {
-info_report("VMI: %s: too soon (suspend=%d, migrate=%d)",
-__func__, suspend_pending, migrate_pending);
+if (suspend_pending || migrate_pending || shutdown_pending) {
+info_report("VMI: %s: too soon (suspend=%d, migrate=%d, shutdown=%d)",
+__func__, suspend_pending, migrate_pending,
+shutdown_pending);
 maybe_disable_socket_reconnect(i);
 qemu_chr_fe_disconnect(&i->sock);
 return;
@@ -662,7 +679,7 @@ static void chr_event_close(VMIntrospection *i)
 cancel_unhook_timer(i);
 cancel_handshake_timer(i);
 
-if (suspend_pending || migrate_pending) {
+if (suspend_pending || migrate_pending || shutdown_pending) {
 maybe_disable_socket_reconnect(i);
 
 if (i->intercepted_action != VMI_INTERCEPT_NONE) {
@@ -752,6 +769,9 @@ static bool record_intercept_action(VMI_intercept_command action)
 case VMI_INTERCEPT_MIGRATE:
 migrate_pending = true;
 break;
+case VMI_INTERCEPT_SHUTDOWN:
+shutdown_pending = true;
+break;
 default:
 return false;
 }
@@ -839,6 +859,9 @@ static void vm_introspection_reset(void *opaque)
 }
 
 update_vm_start_time(i);
+
+/* warm reset triggered by user */
+shutdown_pending = false;
 }
 
 static bool make_cookie_hash(const char *key_id, uint8_t *cookie_hash,
diff --git a/include/sysemu/vmi-intercept.h b/include/sysemu/vmi-intercept.h
index 4b93d17f2b..da086d7a04 100644
--- a/include/sysemu/vmi-intercept.h
+++ b/include/sysemu/vmi-intercept.h
@@ -16,6 +16,7 @@ typedef enum {
 VMI_INTERCEPT_RESUME,
 VMI_INTERCEPT_FORCE_RESET,
 VMI_INTERCEPT_MIGRATE,
+VMI_INTERCEPT_SHUTDOWN,
 } VMI_intercept_command;
 
 bool vm_introspection_intercept(VMI_intercept_command ic, Error **errp);
diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c
index d164635b5f..333a4a0ecc 100644
--- a/monitor/qmp-cmds.c
+++ b/monitor/qmp-cmds.c
@@ -107,6 +107,10 @@ void qmp_system_reset(Error **errp)
 
 void qmp_system_powerdown(Error **errp)
 {
+if (vm_introspection_intercept(VMI_INTERCEPT_SHUTDOWN, errp)) {
+return;
+}
+
 qemu_system_powerdown_request();
 }
 



[RFC PATCH v1 19/26] kvm: vmi: intercept force-reset

2020-04-14 Thread Adalbert Lazăr
From: Marian Rotariu 

On forced reset, KVM and the introspection tool must clean up the
introspection structures. An important thing that must be done by KVM
is to unlink the shared memory pages (the introspection tool
can map memory pages from the introspected VM into its own process/VM).

CC: Markus Armbruster 
Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c| 6 ++
 include/sysemu/vmi-intercept.h | 2 ++
 monitor/qmp-cmds.c | 4 
 3 files changed, 12 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index e511558f3d..90906478b4 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -73,6 +73,7 @@ static const char *action_string[] = {
 "none",
 "suspend",
 "resume",
+"force-reset",
 };
 
 static bool suspend_pending;
@@ -677,6 +678,8 @@ static bool record_intercept_action(VMI_intercept_command action)
 case VMI_INTERCEPT_RESUME:
 suspend_pending = false;
 break;
+case VMI_INTERCEPT_FORCE_RESET:
+break;
 default:
 return false;
 }
@@ -693,6 +696,9 @@ static bool intercept_action(VMIntrospection *i,
 }
 
 switch (action) {
+case VMI_INTERCEPT_FORCE_RESET:
+disconnect_and_unhook_kvmi(i);
+return false;
 case VMI_INTERCEPT_RESUME:
 enable_socket_reconnect(i);
 return false;
diff --git a/include/sysemu/vmi-intercept.h b/include/sysemu/vmi-intercept.h
index 06998ff18a..ef591b49e7 100644
--- a/include/sysemu/vmi-intercept.h
+++ b/include/sysemu/vmi-intercept.h
@@ -14,8 +14,10 @@ typedef enum {
 VMI_INTERCEPT_NONE = 0,
 VMI_INTERCEPT_SUSPEND,
 VMI_INTERCEPT_RESUME,
+VMI_INTERCEPT_FORCE_RESET,
 } VMI_intercept_command;
 
 bool vm_introspection_intercept(VMI_intercept_command ic, Error **errp);
+bool vm_introspection_qmp_delay(void *mon, QObject *id, bool resume);
 
 #endif /* QEMU_VMI_INTERCEPT_H */
diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c
index eabd20fca3..d164635b5f 100644
--- a/monitor/qmp-cmds.c
+++ b/monitor/qmp-cmds.c
@@ -98,6 +98,10 @@ void qmp_stop(Error **errp)
 
 void qmp_system_reset(Error **errp)
 {
+if (vm_introspection_intercept(VMI_INTERCEPT_FORCE_RESET, errp)) {
+return;
+}
+
 qemu_system_reset_request(SHUTDOWN_CAUSE_HOST_QMP_SYSTEM_RESET);
 }
 



[RFC PATCH v1 20/26] kvm: vmi: intercept live migration

2020-04-14 Thread Adalbert Lazăr
From: Marian Rotariu 

It is possible that the introspection tool has made some changes inside
the introspected VM which can make the guest crash if the introspection
connection is suddenly closed.

When the live migration starts, for now, the introspection tool is
signaled to remove its hooks from the introspected VM.

CC: Juan Quintela 
CC: "Dr. David Alan Gilbert" 
Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c| 31 +++
 include/sysemu/vmi-intercept.h |  1 +
 migration/migration.c  | 18 +++---
 migration/migration.h  |  2 ++
 4 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 90906478b4..ea7191e48d 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -21,6 +21,8 @@
 #include "chardev/char.h"
 #include "chardev/char-fe.h"
 #include "migration/vmstate.h"
+#include "migration/migration.h"
+#include "migration/misc.h"
 
 #include "sysemu/vmi-intercept.h"
 #include "sysemu/vmi-handshake.h"
@@ -58,6 +60,7 @@ typedef struct VMIntrospection {
 int64_t vm_start_time;
 
 Notifier machine_ready;
+Notifier migration_state_change;
 bool created_from_command_line;
 
 bool kvmi_hooked;
@@ -74,9 +77,11 @@ static const char *action_string[] = {
 "suspend",
 "resume",
 "force-reset",
+"migrate",
 };
 
 static bool suspend_pending;
+static bool migrate_pending;
 
 #define TYPE_VM_INTROSPECTION "introspection"
 
@@ -88,6 +93,15 @@ static bool suspend_pending;
 static Error *vm_introspection_init(VMIntrospection *i);
 static void vm_introspection_reset(void *opaque);
 
+static void migration_state_notifier(Notifier *notifier, void *data)
+{
+MigrationState *s = data;
+
+if (migration_has_failed(s)) {
+migrate_pending = false;
+}
+}
+
 static void machine_ready(Notifier *notifier, void *data)
 {
 VMIntrospection *i = container_of(notifier, VMIntrospection, machine_ready);
@@ -144,6 +158,9 @@ static void complete(UserCreatable *uc, Error **errp)
 
 ic->uniq = i;
 
+i->migration_state_change.notify = migration_state_notifier;
+add_migration_state_change_notifier(&i->migration_state_change);
+
 qemu_register_reset(vm_introspection_reset, i);
 }
 
@@ -478,6 +495,9 @@ static void continue_with_the_intercepted_action(VMIntrospection *i)
 case VMI_INTERCEPT_SUSPEND:
 vm_stop(RUN_STATE_PAUSED);
 break;
+case VMI_INTERCEPT_MIGRATE:
+start_live_migration_thread(migrate_get_current());
+break;
 default:
 error_report("VMI: %s: unexpected action %d",
  __func__, i->intercepted_action);
@@ -571,9 +591,9 @@ static void chr_event_open(VMIntrospection *i)
 {
 Error *local_err = NULL;
 
-if (suspend_pending) {
-info_report("VMI: %s: too soon (suspend=%d)",
-__func__, suspend_pending);
+if (suspend_pending || migrate_pending) {
+info_report("VMI: %s: too soon (suspend=%d, migrate=%d)",
+__func__, suspend_pending, migrate_pending);
 maybe_disable_socket_reconnect(i);
 qemu_chr_fe_disconnect(&i->sock);
 return;
@@ -608,7 +628,7 @@ static void chr_event_close(VMIntrospection *i)
 cancel_unhook_timer(i);
 cancel_handshake_timer(i);
 
-if (suspend_pending) {
+if (suspend_pending || migrate_pending) {
 maybe_disable_socket_reconnect(i);
 
 if (i->intercepted_action != VMI_INTERCEPT_NONE) {
@@ -680,6 +700,9 @@ static bool record_intercept_action(VMI_intercept_command action)
 break;
 case VMI_INTERCEPT_FORCE_RESET:
 break;
+case VMI_INTERCEPT_MIGRATE:
+migrate_pending = true;
+break;
 default:
 return false;
 }
diff --git a/include/sysemu/vmi-intercept.h b/include/sysemu/vmi-intercept.h
index ef591b49e7..b4a9a3faa7 100644
--- a/include/sysemu/vmi-intercept.h
+++ b/include/sysemu/vmi-intercept.h
@@ -15,6 +15,7 @@ typedef enum {
 VMI_INTERCEPT_SUSPEND,
 VMI_INTERCEPT_RESUME,
 VMI_INTERCEPT_FORCE_RESET,
+VMI_INTERCEPT_MIGRATE,
 } VMI_intercept_command;
 
 bool vm_introspection_intercept(VMI_intercept_command ic, Error **errp);
diff --git a/migration/migration.c b/migration/migration.c
index 187ac0410c..222037d739 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -55,6 +55,8 @@
 #include "qemu/queue.h"
 #include "multifd.h"
 
+#include "sysemu/vmi-intercept.h"
+
 #define MAX_THROTTLE  (32 << 20)  /* Migration transfer speed throttling */
 
 /* Amount of time to allocate to each "chunk" of bandwidth-throttled
@@ -3471,6 +3473,13 @@ static void *migration_thread(void *opaque)
 return NULL;
 }
 
+void start_live_migration_thread(MigrationState *s)
+{
+qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
+QEMU_THREAD_JOINABLE);
+s->migration_thread_running = true;
+}
+
 void migrate_fd_conne

[RFC PATCH v1 00/26] VM introspection

2020-04-14 Thread Adalbert Lazăr
The KVM introspection subsystem provides a facility for applications
running on the host or in a separate VM, to control the execution of
other VMs (pause, resume, shutdown), query the state of the vCPUs (GPRs,
MSRs etc.), alter the page access bits in the shadow page tables (only
for the hardware backed ones, eg. Intel's EPT) and receive notifications
when events of interest have taken place (shadow page table level faults,
key MSR writes, hypercalls etc.).

This is the userspace part of the KVM introspection API already posted
on the KVM list[1]. Thanks to Samuel Laurén and Mathieu Tarral, this
new VMI API has been integrated into the KVM-VMI[2] project. The pull
request into the libVMI[3] library is under review.

As suggested by Stefan Hajnoczi and Paolo Bonzini, the connection with the
introspection tool is initiated by the QEMU process of the introspected VM
using a socket. After the handshake, QEMU will hand over the file
descriptor to KVM. From this point, the introspection tool will use
the socket to send introspection commands (read/write guest memory, set
page access, etc.) directly to KVM and to receive introspection events
(breakpoint, page fault, etc.). However, for some user actions such
as pause, suspend, live migration, etc., we rely on QEMU to notify KVM,
that will notify the introspection tool, to remove the changes made to
the guest, so that the guest can run when the introspection channel
is disconnected.

The patches were tested with QEMU 2.12 (through libvirt 1.3.1) and
summarily tested with 5.0.0-rc2, except for the last two patches (25
and 26), which were not tested at all but are still included for initial
feedback.

Patches 01-06 add some extensions to the current code, which may or
may not be needed for the next patches, but we're looking forward to your
comments on these. Except for patch 6, all are chardev/socket related.

Patch 07 adds the KVM ioctls for VM introspection:
  - KVM_INTROSPECTION_HOOK used to hand over the file descriptor
  - KVM_INTROSPETION_PREUNHOOK used on pause/suspend/live migration
  - KVM_INTROSPECTION_UNHOOK used to clean-up the introspection structures
from KVM
  - KVM_INTROSPECTION_COMMAND and KVM_INTROSPECTION_EVENT used to mark the
the introspection commands/events that are allowed.

Patches 08 and 09 introduce the new introspection object. Patch 08
contains the usage documentation of this object with all the parameters
that will be added by the next patches. We've tested the creation of
this object through QMP/libvirt and we rely on this to start the VM
introspection for any running VM.

Patches 10-12 add the handshake, the authentication of the introspection
tool and the handover to KVM.

Patches 13-15 add some safeguards (block the destruction of the
introspection object if the introspection socket is connected and
allow only one instance of the introspection object) and force the
socket reconnection on guest reset. Blocking the destruction of the
introspection object might not be enough, because we also want to block
the destruction of the introspection socket. Or it might be too much,
because this can be done through QMP, and whoever has access to it
may crash the guest in multiple ways.

Patches 16-17 add the first intercepted commands (pause/resume) and
introduce one of the methods we use to delay intercepted commands
until the introspection tool has a chance to react.

Patch 18 adds the information we save with the VM snapshot:
the VM start time.

Patches 19-20 add the interception of force-reset and live migration
commands.

Patch 21 adds a workaround to block the snapshots with memory done by
libvirt until the introspection tool has a chance to react. It hasn't
been tested with 5.0.0-rc2. For 2.12 the patch is slightly bigger.

Patch 22 adds a second method of delaying the intercepted commands,
by running the main loop.

Patches 23-24 add the interception of the shutdown command, which doesn't
seem to be done right because the shutdown signal might not be delivered
to the guest, not to mention that it is desirable to catch all sources that
may trigger the shutdown.

Patch 25, which is not tested, extends the handshake structures to send
the e820 table (for the x86* architectures).

Patch 26 adds the properties to control what introspection commands
and what introspection events are allowed for this guest.

[1]: https://lore.kernel.org/kvm/20200330101308.21702-1-ala...@bitdefender.com/
[2]: https://github.com/KVM-VMI/kvm-vmi
[3]: https://github.com/libvmi/libvmi

Adalbert Lazăr (20):
  chardev: tcp: allow to change the reconnect timer
  char-socket: allow vsock parameters (cid, port)
  char-socket: fix the client mode when created through QMP
  char-socket: add 'reconnecting' property
  char-socket: add 'fd' property
  E820: extend the table access interface
  linux-headers: update with VM introspection interface
  kvm: introduce the VM introspection object
  kvm: vmi: add the handshake with the introspection tool
  kvm: vmi:

[RFC PATCH v1 25/26] kvm: vmi: extend handshake to include the e820 table

2020-04-14 Thread Adalbert Lazăr
The introspection tool can use the e820 table to avoid accessing
(read/write) reserved memory pages or modifying their access rights (rwx).
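
A small sketch of how the tool side might walk these entries, assuming the
qemu_vmi_e820_entry/qemu_vmi_to_introspector layout from the vmi-handshake.h
hunk below and a fully received handshake buffer:

#include <stdio.h>
#include <inttypes.h>
#include "sysemu/vmi-handshake.h"

/* Print the e820 entries appended after the fixed part of the handshake. */
static void dump_e820(const qemu_vmi_to_introspector *msg)
{
    unsigned int k;

    for (k = 0; k < msg->arch.e820_count; k++) {
        const qemu_vmi_e820_entry *e = &msg->arch.e820_entries[k];

        printf("e820[%u]: address=0x%" PRIx64 " length=0x%" PRIx64 " type=%" PRIu32 "\n",
               k, e->address, e->length, e->type);
    }
}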

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c| 68 ++
 include/sysemu/vmi-handshake.h | 23 +++-
 2 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 02877eec06..f70d78848a 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -26,6 +26,7 @@
 #include "migration/misc.h"
 #include "qapi/qmp/qobject.h"
 #include "monitor/monitor.h"
+#include "hw/i386/e820_memory_layout.h"
 
 #include "sysemu/vmi-intercept.h"
 #include "sysemu/vmi-handshake.h"
@@ -412,23 +413,74 @@ static void register_types(void)
 
 type_init(register_types);
 
+static uint8_t handshake_cpu_type(void)
+{
+#ifdef TARGET_X86_64
+return QEMU_VMI_CPU_TYPE_X86_64;
+#elif TARGET_I386
+return QEMU_VMI_CPU_TYPE_I386;
+#else
+return QEMU_VMI_CPU_TYPE_UNKNOWN;
+#endif
+}
+
+static int cmp_address(const void *a, const void *b)
+{
+uint64_t addr_a = ((qemu_vmi_e820_entry *)a)->address;
+uint64_t addr_b = ((qemu_vmi_e820_entry *)b)->address;
+
+return (addr_a > addr_b) - (addr_a < addr_b);
+}
+
+static void fill_e820_info(qemu_vmi_e820_entry *dest, int n)
+{
+int idx;
+
+for (idx = 0; idx < n; idx++)
+e820_get_entry2(idx, &dest[idx].type, &dest[idx].address,
+&dest[idx].length);
+
+qsort(dest, n, sizeof(*dest), cmp_address);
+}
+
 static bool send_handshake_info(VMIntrospection *i, Error **errp)
 {
-qemu_vmi_to_introspector send = {};
+qemu_vmi_to_introspector *send;
+int max_n_e820, n_e820;
 const char *vm_name;
+size_t send_sz;
 int r;
 
-send.struct_size = sizeof(send);
-send.start_time = i->vm_start_time;
-memcpy(&send.uuid, &qemu_uuid, sizeof(send.uuid));
+max_n_e820 = 8 * sizeof(((qemu_vmi_to_introspector *)0)->arch.e820_count);
+n_e820 = e820_get_num_entries();
+
+if (n_e820 < 0 || n_e820 > max_n_e820) {
+warn_report("VMI: discard e820 info (size %d, max %d)",
+n_e820, max_n_e820);
+n_e820 = 0;
+}
+
+send_sz = sizeof(*send) + n_e820 * sizeof(qemu_vmi_e820_entry);
+
+send = g_malloc0(send_sz);
+
+send->struct_size = send_sz;
+send->start_time = i->vm_start_time;
+send->cpu_type = handshake_cpu_type();
+memcpy(&send->uuid, &qemu_uuid, sizeof(send->uuid));
 vm_name = qemu_get_vm_name();
 if (vm_name) {
-snprintf(send.name, sizeof(send.name), "%s", vm_name);
-send.name[sizeof(send.name) - 1] = 0;
+snprintf(send->name, sizeof(send->name), "%s", vm_name);
+send->name[sizeof(send->name) - 1] = 0;
+}
+send->arch.e820_count = n_e820;
+if (n_e820) {
+fill_e820_info(send->arch.e820_entries, n_e820);
 }
 
-r = qemu_chr_fe_write_all(&i->sock, (uint8_t *)&send, sizeof(send));
-if (r != sizeof(send)) {
+r = qemu_chr_fe_write_all(&i->sock, (uint8_t *)send, send_sz);
+g_free(send);
+if (r != send_sz) {
 error_setg_errno(errp, errno, "VMI: error writing to '%s'",
  i->chardevid);
 return false;
diff --git a/include/sysemu/vmi-handshake.h b/include/sysemu/vmi-handshake.h
index 19bdfb6740..3c5201d37b 100644
--- a/include/sysemu/vmi-handshake.h
+++ b/include/sysemu/vmi-handshake.h
@@ -9,6 +9,25 @@
 enum { QEMU_VMI_NAME_SIZE = 64 };
 enum { QEMU_VMI_COOKIE_HASH_SIZE = 20};
 
+enum {
+QEMU_VMI_CPU_TYPE_I386 = 0,
+QEMU_VMI_CPU_TYPE_X86_64 = 1,
+QEMU_VMI_CPU_TYPE_UNKNOWN = 255
+};
+
+typedef struct qemu_vmi_e820_entry {
+uint64_t address;
+uint64_t length;
+uint32_t type;
+uint32_t padding;
+} qemu_vmi_e820_entry;
+
+typedef struct qemu_vmi_to_introspector_x86 {
+   uint8_t e820_count;
+   uint8_t padding[3];
+   qemu_vmi_e820_entry e820_entries[0];
+} qemu_vmi_to_introspector_x86;
+
 /**
  * qemu_vmi_to_introspector:
  *
@@ -22,9 +41,11 @@ enum { QEMU_VMI_COOKIE_HASH_SIZE = 20};
 typedef struct qemu_vmi_to_introspector {
 uint32_t struct_size;
 uint8_t  uuid[16];
-uint32_t padding;
+uint8_t  cpu_type;
+uint8_t  padding[3];
 int64_t  start_time;
 char name[QEMU_VMI_NAME_SIZE];
+qemu_vmi_to_introspector_x86 arch;
 /* ... */
 } qemu_vmi_to_introspector;
 



[RFC PATCH v1 11/26] kvm: vmi: add 'handshake_timeout' property

2020-04-14 Thread Adalbert Lazăr
A timer armed during the handshake lets QEMU recover from a blocked
connection: if the introspection tool does not complete the handshake in
time, the socket is disconnected (and, with the reconnect parameter set,
reopened).
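
A hypothetical command line shortening the timeout (the default introduced
here is 10 seconds), following the -object syntax documented in patch 08:

  -object introspection,id=vmi,chardev=vmi_chardev,handshake_timeout=5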

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 66 -
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 57ded2f69c..5659663caa 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -19,6 +19,8 @@
 
 #include "sysemu/vmi-handshake.h"
 
+#define HANDSHAKE_TIMEOUT_SEC 10
+
 typedef struct VMIntrospection {
 Object parent_obj;
 
@@ -32,6 +34,8 @@ typedef struct VMIntrospection {
 qemu_vmi_from_introspector hsk_in;
 uint64_t hsk_in_read_pos;
 uint64_t hsk_in_read_size;
+GSource *hsk_timer;
+uint32_t handshake_timeout;
 
 int64_t vm_start_time;
 
@@ -105,6 +109,26 @@ static void prop_set_chardev(Object *obj, const char *value, Error **errp)
 i->chardevid = g_strdup(value);
 }
 
+static void prop_get_uint32(Object *obj, Visitor *v, const char *name,
+void *opaque, Error **errp)
+{
+uint32_t *value = opaque;
+
+visit_type_uint32(v, name, value, errp);
+}
+
+static void prop_set_uint32(Object *obj, Visitor *v, const char *name,
+void *opaque, Error **errp)
+{
+uint32_t *value = opaque;
+Error *local_err = NULL;
+
+visit_type_uint32(v, name, value, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+}
+}
+
 static bool chardev_is_connected(VMIntrospection *i, Error **errp)
 {
 Object *obj = OBJECT(i->chr);
@@ -129,6 +153,11 @@ static void instance_init(Object *obj)
 update_vm_start_time(i);
 
 object_property_add_str(obj, "chardev", NULL, prop_set_chardev, NULL);
+
+i->handshake_timeout = HANDSHAKE_TIMEOUT_SEC;
+object_property_add(obj, "handshake_timeout", "uint32",
+prop_set_uint32, prop_get_uint32,
+NULL, &i->handshake_timeout, NULL);
 }
 
 static void disconnect_chardev(VMIntrospection *i)
@@ -165,12 +194,28 @@ static void disconnect_and_unhook_kvmi(VMIntrospection *i)
 unhook_kvmi(i);
 }
 
+static void cancel_timer(GSource *timer)
+{
+if (timer) {
+g_source_destroy(timer);
+g_source_unref(timer);
+}
+}
+
+static void cancel_handshake_timer(VMIntrospection *i)
+{
+cancel_timer(i->hsk_timer);
+i->hsk_timer = NULL;
+}
+
 static void instance_finalize(Object *obj)
 {
 VMIntrospection *i = VM_INTROSPECTION(obj);
 
 g_free(i->chardevid);
 
+cancel_handshake_timer(i);
+
 if (i->chr) {
 shutdown_socket_fd(i);
 qemu_chr_fe_deinit(&i->sock, true);
@@ -303,7 +348,7 @@ static int chr_can_read(void *opaque)
 {
 VMIntrospection *i = opaque;
 
-if (i->sock_fd == -1) {
+if (i->hsk_timer == NULL || i->sock_fd == -1) {
 return 0;
 }
 
@@ -356,10 +401,24 @@ static void chr_read(void *opaque, const uint8_t *buf, int size)
 }
 
 if (enough_bytes_for_handshake(i)) {
+cancel_handshake_timer(i);
 validate_and_connect(i);
 }
 }
 
+static gboolean chr_timeout(gpointer opaque)
+{
+VMIntrospection *i = opaque;
+
+warn_report("VMI: the handshake takes too long");
+
+g_source_unref(i->hsk_timer);
+i->hsk_timer = NULL;
+
+disconnect_and_unhook_kvmi(i);
+return FALSE;
+}
+
 static void chr_event_open(VMIntrospection *i)
 {
 Error *local_err = NULL;
@@ -378,6 +437,9 @@ static void chr_event_open(VMIntrospection *i)
 memset(&i->hsk_in, 0, sizeof(i->hsk_in));
 i->hsk_in_read_pos = 0;
 i->hsk_in_read_size = 0;
+i->hsk_timer = qemu_chr_timeout_add_ms(i->chr,
+   i->handshake_timeout * 1000,
+   chr_timeout, i);
 }
 
 static void chr_event_close(VMIntrospection *i)
@@ -386,6 +448,8 @@ static void chr_event_close(VMIntrospection *i)
 warn_report("VMI: introspection tool disconnected");
 disconnect_and_unhook_kvmi(i);
 }
+
+cancel_handshake_timer(i);
 }
 
 static void chr_event(void *opaque, QEMUChrEvent event)



[RFC PATCH v1 17/26] kvm: vmi: add 'unhook_timeout' property

2020-04-14 Thread Adalbert Lazăr
When the introspection tool has to remove all changes made to the
introspected VM, the guest must run because some hooks can be removed only
under certain conditions. But this shouldn't take too long even with a host
under heavy load. So, if the introspection tool has not closed the socket
by the end of this unhook process, within the time specified by the
unhook_timeout property, QEMU will shut down the socket.
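
A hypothetical command line tightening this limit (the default introduced here
is 60 seconds), following the -object syntax documented in patch 08:

  -object introspection,id=vmi,chardev=vmi_chardev,unhook_timeout=30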

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 151e27265a..1f3aff3bfe 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -25,6 +25,7 @@
 #include "sysemu/vmi-handshake.h"
 
 #define HANDSHAKE_TIMEOUT_SEC 10
+#define UNHOOK_TIMEOUT_SEC 60
 
 typedef struct VMIntrospection {
 Object parent_obj;
@@ -48,6 +49,8 @@ typedef struct VMIntrospection {
 uint32_t handshake_timeout;
 
 int intercepted_action;
+GSource *unhook_timer;
+uint32_t unhook_timeout;
 
 int reconnect_time;
 
@@ -219,6 +222,11 @@ static void instance_init(Object *obj)
 object_property_add(obj, "handshake_timeout", "uint32",
 prop_set_uint32, prop_get_uint32,
 NULL, &i->handshake_timeout, NULL);
+
+i->unhook_timeout = UNHOOK_TIMEOUT_SEC;
+object_property_add(obj, "unhook_timeout", "uint32",
+prop_set_uint32, prop_get_uint32,
+NULL, &i->unhook_timeout, NULL);
 }
 
 static void disconnect_chardev(VMIntrospection *i)
@@ -269,6 +277,12 @@ static void cancel_handshake_timer(VMIntrospection *i)
 i->hsk_timer = NULL;
 }
 
+static void cancel_unhook_timer(VMIntrospection *i)
+{
+cancel_timer(i->unhook_timer);
+i->unhook_timer = NULL;
+}
+
 static void instance_finalize(Object *obj)
 {
 VMIntrospectionClass *ic = VM_INTROSPECTION_CLASS(obj->class);
@@ -277,6 +291,7 @@ static void instance_finalize(Object *obj)
 g_free(i->chardevid);
 g_free(i->keyid);
 
+cancel_unhook_timer(i);
 cancel_handshake_timer(i);
 
 if (i->chr) {
@@ -576,6 +591,7 @@ static void chr_event_close(VMIntrospection *i)
 disconnect_and_unhook_kvmi(i);
 }
 
+cancel_unhook_timer(i);
 cancel_handshake_timer(i);
 
 if (suspend_pending) {
@@ -604,6 +620,19 @@ static void chr_event(void *opaque, QEMUChrEvent event)
 }
 }
 
+static gboolean unhook_timeout_cbk(gpointer opaque)
+{
+VMIntrospection *i = opaque;
+
+warn_report("VMI: the introspection tool is too slow");
+
+g_source_unref(i->unhook_timer);
+i->unhook_timer = NULL;
+
+disconnect_and_unhook_kvmi(i);
+return FALSE;
+}
+
 static VMIntrospection *vm_introspection_object(void)
 {
 VMIntrospectionClass *ic;
@@ -663,6 +692,10 @@ static bool intercept_action(VMIntrospection *i,
 return false;
 }
 
+i->unhook_timer = qemu_chr_timeout_add_ms(i->chr,
+  i->unhook_timeout * 1000,
+  unhook_timeout_cbk, i);
+
 i->intercepted_action = action;
 return true;
 }



[RFC PATCH v1 09/26] kvm: introduce the VM introspection object

2020-04-14 Thread Adalbert Lazăr
This is used to initiate the connection with the introspection tool and
hand over the file descriptor to KVM. The object needs a chardev socket
(in client mode) created with the 'reconnect' property set.
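
A minimal invocation matching that requirement, mirroring the examples in the
documentation added by patch 08:

  -chardev socket,id=vmi_chardev,type=unix,path=/tmp/vmi-guest1.sock,reconnect=10 \
  -object introspection,id=vmi,chardev=vmi_chardev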

CC: Paolo Bonzini 
Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/Makefile.objs |   1 +
 accel/kvm/vmi.c | 168 
 2 files changed, 169 insertions(+)
 create mode 100644 accel/kvm/vmi.c

diff --git a/accel/kvm/Makefile.objs b/accel/kvm/Makefile.objs
index fdfa481578..5e85294eb3 100644
--- a/accel/kvm/Makefile.objs
+++ b/accel/kvm/Makefile.objs
@@ -1,2 +1,3 @@
 obj-y += kvm-all.o
+obj-y += vmi.o
 obj-$(call lnot,$(CONFIG_SEV)) += sev-stub.o
diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
new file mode 100644
index 00..883c666a2a
--- /dev/null
+++ b/accel/kvm/vmi.c
@@ -0,0 +1,168 @@
+/*
+ * VM Introspection
+ *
+ * Copyright (C) 2017-2020 Bitdefender S.R.L.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qom/object_interfaces.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
+#include "chardev/char.h"
+#include "chardev/char-fe.h"
+
+typedef struct VMIntrospection {
+Object parent_obj;
+
+Error *init_error;
+
+char *chardevid;
+Chardev *chr;
+
+Notifier machine_ready;
+bool created_from_command_line;
+} VMIntrospection;
+
+#define TYPE_VM_INTROSPECTION "introspection"
+
+#define VM_INTROSPECTION(obj) \
+OBJECT_CHECK(VMIntrospection, (obj), TYPE_VM_INTROSPECTION)
+
+static Error *vm_introspection_init(VMIntrospection *i);
+
+static void machine_ready(Notifier *notifier, void *data)
+{
+VMIntrospection *i = container_of(notifier, VMIntrospection, machine_ready);
+
+i->init_error = vm_introspection_init(i);
+if (i->init_error) {
+Error *err = error_copy(i->init_error);
+
+error_report_err(err);
+if (i->created_from_command_line) {
+exit(1);
+}
+}
+}
+
+static void complete(UserCreatable *uc, Error **errp)
+{
+VMIntrospection *i = VM_INTROSPECTION(uc);
+
+if (!i->chardevid) {
+error_setg(errp, "VMI: chardev is not set");
+return;
+}
+
+i->machine_ready.notify = machine_ready;
+qemu_add_machine_init_done_notifier(&i->machine_ready);
+
+/*
+ * If the introspection object is created while parsing the command line,
+ * the machine_ready callback will be called later. At that time,
+ * if vm_introspection_init() fails, exit() will be called.
+ *
+ * If the introspection object is created through QMP, machine_init_done
+ * is already set and qemu_add_machine_init_done_notifier() will
+ * call our machine_done() callback. If vm_introspection_init() fails,
+ * we don't call exit() and report the error back to the user.
+ */
+if (i->init_error) {
+*errp = i->init_error;
+i->init_error = NULL;
+return;
+}
+}
+
+static void prop_set_chardev(Object *obj, const char *value, Error **errp)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+g_free(i->chardevid);
+i->chardevid = g_strdup(value);
+}
+
+static void class_init(ObjectClass *oc, void *data)
+{
+UserCreatableClass *uc = USER_CREATABLE_CLASS(oc);
+
+uc->complete = complete;
+}
+
+static void instance_init(Object *obj)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+i->created_from_command_line = (qdev_hotplug == false);
+
+object_property_add_str(obj, "chardev", NULL, prop_set_chardev, NULL);
+}
+
+static void instance_finalize(Object *obj)
+{
+VMIntrospection *i = VM_INTROSPECTION(obj);
+
+g_free(i->chardevid);
+
+error_free(i->init_error);
+}
+
+static const TypeInfo info = {
+.name  = TYPE_VM_INTROSPECTION,
+.parent= TYPE_OBJECT,
+.class_init= class_init,
+.instance_size = sizeof(VMIntrospection),
+.instance_finalize = instance_finalize,
+.instance_init = instance_init,
+.interfaces= (InterfaceInfo[]){
+{TYPE_USER_CREATABLE},
+{}
+}
+};
+
+static void register_types(void)
+{
+type_register_static(&info);
+}
+
+type_init(register_types);
+
+static Error *vm_introspection_init(VMIntrospection *i)
+{
+Error *err = NULL;
+int kvmi_version;
+Chardev *chr;
+
+if (!kvm_enabled()) {
+error_setg(&err, "VMI: missing KVM support");
+return err;
+}
+
+kvmi_version = kvm_check_extension(kvm_state, KVM_CAP_INTROSPECTION);
+if (kvmi_version == 0) {
+error_setg(&err,
+   "VMI: missing kernel built with CONFIG_KVM_INTROSPECTION");
+return err;
+}
+
+chr = qemu_chr_find(i->chardevid);
+if (!chr) {
+error_setg(&err, "VMI: device '%s' not found", i->chardevid);
+return err;

[RFC PATCH v1 18/26] kvm: vmi: store/restore 'vm_start_time' on migrate/snapshot

2020-04-14 Thread Adalbert Lazăr
The VM start time sent during handshake can be used by the introspection
tool as a session id.

We save this 'VM start time' with the snapshot so that it can be sent again
to the introspection tool when the VM is restored from the snapshot and the
introspection connection is reestablished.

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 1f3aff3bfe..e511558f3d 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -20,6 +20,7 @@
 #include "crypto/hash.h"
 #include "chardev/char.h"
 #include "chardev/char-fe.h"
+#include "migration/vmstate.h"
 
 #include "sysemu/vmi-intercept.h"
 #include "sysemu/vmi-handshake.h"
@@ -203,6 +204,16 @@ static void class_init(ObjectClass *oc, void *data)
 uc->can_be_deleted = introspection_can_be_deleted;
 }
 
+static const VMStateDescription vmstate_introspection = {
+.name = "vm_introspection",
+.minimum_version_id = 1,
+.version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_INT64(vm_start_time, VMIntrospection),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static void instance_init(Object *obj)
 {
 VMIntrospectionClass *ic = VM_INTROSPECTION_CLASS(obj->class);
@@ -227,6 +238,8 @@ static void instance_init(Object *obj)
 object_property_add(obj, "unhook_timeout", "uint32",
 prop_set_uint32, prop_get_uint32,
 NULL, &i->unhook_timeout, NULL);
+
+vmstate_register(NULL, 0, &vmstate_introspection, i);
 }
 
 static void disconnect_chardev(VMIntrospection *i)



[RFC PATCH v1 05/26] char-socket: add 'fd' property

2020-04-14 Thread Adalbert Lazăr
This is used by the VM introspection object, after the handshake, to hand
over the file descriptor to KVM.
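
A rough sketch of that hand-over, assuming the kvm_introspection_hook layout
from patch 07 and QEMU's object_property_get_int()/kvm_vm_ioctl() helpers; the
real call site is the introspection object's connect_kernel() (not shown in
full in this series excerpt), so treat this as illustrative only:

/* Read the socket fd exposed by the chardev and pass it to KVM. */
static bool hook_kvmi_sketch(VMIntrospection *i, Error **errp)
{
    struct kvm_introspection_hook hook = {};
    int64_t fd;

    fd = object_property_get_int(OBJECT(i->chr), "fd", errp);
    if (fd < 0) {
        return false;
    }

    hook.fd = fd;
    memcpy(hook.uuid, &qemu_uuid, sizeof(hook.uuid));

    if (kvm_vm_ioctl(kvm_state, KVM_INTROSPECTION_HOOK, &hook)) {
        error_setg_errno(errp, errno, "VMI: KVM_INTROSPECTION_HOOK failed");
        return false;
    }

    i->sock_fd = fd;
    i->kvmi_hooked = true;
    return true;
}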

CC: "Marc-André Lureau" 
CC: Paolo Bonzini 
Signed-off-by: Adalbert Lazăr 
---
 chardev/char-socket.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 22ab242748..76d0fb8839 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -1499,6 +1499,21 @@ static bool char_socket_get_reconnecting(Object *obj, Error **errp)
 return s->reconnect_time > 0;
 }
 
+static void
+char_socket_get_fd(Object *obj, Visitor *v, const char *name, void *opaque,
+   Error **errp)
+{
+int fd = -1;
+SocketChardev *s = SOCKET_CHARDEV(obj);
+QIOChannelSocket *sock = QIO_CHANNEL_SOCKET(s->sioc);
+
+if (sock) {
+fd = sock->fd;
+}
+
+visit_type_int32(v, name, &fd, errp);
+}
+
 static int tcp_chr_reconnect_time(Chardev *chr, int secs)
 {
 SocketChardev *s = SOCKET_CHARDEV(chr);
@@ -1539,6 +1554,9 @@ static void char_socket_class_init(ObjectClass *oc, void *data)
 object_class_property_add_bool(oc, "reconnecting",
char_socket_get_reconnecting,
NULL, &error_abort);
+
+object_class_property_add(oc, "fd", "int32", char_socket_get_fd,
+  NULL, NULL, NULL, &error_abort);
 }
 
 static const TypeInfo char_socket_type_info = {



[RFC PATCH v1 08/26] kvm: add VM introspection usage documentation

2020-04-14 Thread Adalbert Lazăr
From: Marian Rotariu 

Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 qemu-options.hx | 76 +
 1 file changed, 76 insertions(+)

diff --git a/qemu-options.hx b/qemu-options.hx
index 16debd03cb..6c5618e310 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -5005,6 +5005,82 @@ SRST
 ::
 
 (qemu) qom-set /objects/iothread1 poll-max-ns 10
+
+``-object introspection,id=id,chardev=id[,key=id][,handshake_timeout=seconds][,unhook_timeout=seconds][,command=id[,...]][,event=id[,...]]``
+Defines a VM Introspection (VMI) object that will connect to
+an introspection tool, initiate the handshake and hand over the connection
+file descriptor to KVM. The introspection channel will be used by
+the introspection tool to talk directly with KVM. If the VM is paused
+or migrated, QEMU will delay the action, signal KVM, which in turn will
+signal the introspection tool to remove its hooks (e.g. breakpoints
+placed inside the guest).
+
+The ``chardev`` parameter provides the introspection channel.
+This is the id of a previously created chardev socket,
+with a non-zero reconnect parameter.
+
+The ``key`` parameter is an optional secret object used to
+authenticate the introspection tool.
+
+The ``handshake_timeout`` parameter specifies how long QEMU will
+wait for the introspection tool during the handshake (default is
+10 seconds).
+
+The ``unhook_timeout`` parameter specifies how long QEMU will
+wait for the introspection tool on pause/migrate (default is
+60 seconds).
+
+The ``command`` parameter specifies an allowed introspection command.
+It can be used multiple times. If omitted, all commands are
+allowed. For example, ``command=10,command=8`` will allow the
+introspection tool to use two commands, KVMI_VCPU_PAUSE(10) and
+KVMI_VM_WRITE_PHYSICAL(8), in addition to those that are used
+to query the API, which are always allowed (KVMI_GET_VERSION,
+KVMI_VM_CHECK_COMMAND and KVMI_VM_CHECK_EVENT).
+
+The ``event`` parameter specifies an allowed introspection event.
+It can be used multiple times. If omitted, all events
+are allowed. For example, ``event=1,event=3`` will
+allow the introspection tool to receive only two events,
+KVMI_EVENT_PAUSE_VCPU(1) and KVMI_EVENT_BREAKPOINT(3).
+
+VM introspected through a unix socket:
+
+.. parsed-literal::
+
+ # |qemu_system_x86| \
+ ..
+ -chardev socket,id=vmi_chardev,type=unix,path=/tmp/vmi-guest1.sock,reconnect=10 \
+ -object introspection,id=vmi,chardev=vmi_chardev
+
+VM introspected by an authenticated introspection tool:
+
+.. parsed-literal::
+
+ # |qemu_system_x86| \
+ ..
+ -chardev socket,id=vmi_chardev,type=unix,path=/tmp/vmi-guest1.sock,reconnect=10 \
+ -object secret,id=vmi_key,file=/etc/secret \
+ -object introspection,id=vmi,chardev=vmi_chardev,key=vmi_key
+
+VM introspected through a virtual socket, with the introspection tool
+listening on port 4321 from another VM started with cid=1234:
+
+.. parsed-literal::
+
+ # |qemu_system_x86| \
+ ..
+ -chardev socket,id=vmi_chardev,type=vsock,cid=1234,port=4321,reconnect=10 \
+ -object introspection,id=vmi,chardev=vmi_chardev
+
+VM running the introspection tool:
+
+.. parsed-literal::
+
+ # |qemu_system_x86| \
+ ..
+ -device vhost-vsock-pci,id=vhost-vsock-pci0,guest-cid=1234
+
 ERST
 
 



[RFC PATCH v1 15/26] kvm: vmi: reconnect the socket on reset

2020-04-14 Thread Adalbert Lazăr
From: Marian Rotariu 

The guest could be reset for various reasons, and by disconnecting the
socket (which will reconnect), KVM and the introspection tool are
notified and can clean up the introspection structures.

Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 54c56c6e13..5beec2b091 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -13,6 +13,7 @@
 #include "qemu/error-report.h"
 #include "qom/object_interfaces.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/reset.h"
 #include "sysemu/kvm.h"
 #include "crypto/secret.h"
 #include "crypto/hash.h"
@@ -66,6 +67,7 @@ typedef struct VMIntrospectionClass {
 OBJECT_CLASS_CHECK(VMIntrospectionClass, (class), TYPE_VM_INTROSPECTION)
 
 static Error *vm_introspection_init(VMIntrospection *i);
+static void vm_introspection_reset(void *opaque);
 
 static void machine_ready(Notifier *notifier, void *data)
 {
@@ -122,6 +124,8 @@ static void complete(UserCreatable *uc, Error **errp)
 }
 
 ic->uniq = i;
+
+qemu_register_reset(vm_introspection_reset, i);
 }
 
 static void prop_set_chardev(Object *obj, const char *value, Error **errp)
@@ -273,6 +277,8 @@ static void instance_finalize(Object *obj)
 if (!ic->instance_counter) {
 ic->uniq = NULL;
 }
+
+qemu_unregister_reset(vm_introspection_reset, i);
 }
 
 static const TypeInfo info = {
@@ -534,6 +540,18 @@ static void chr_event(void *opaque, QEMUChrEvent event)
 }
 }
 
+static void vm_introspection_reset(void *opaque)
+{
+VMIntrospection *i = opaque;
+
+if (i->sock_fd != -1) {
+info_report("VMI: Reset detected. Closing the socket...");
+disconnect_and_unhook_kvmi(i);
+}
+
+update_vm_start_time(i);
+}
+
 static bool make_cookie_hash(const char *key_id, uint8_t *cookie_hash,
  Error **errp)
 {



[RFC PATCH v1 01/26] chardev: tcp: allow to change the reconnect timer

2020-04-14 Thread Adalbert Lazăr
When the introspected VM is paused/suspended/migrated, the introspection
tool removes its hooks from the guest and closes the connection.
This is detected by KVM, which in turn will clean up the introspection
structures. Thanks to the reconnect parameter, the chardev will reconnect
with the introspection tool, which will try to hook the VM again,
assuming that the pause/suspend/migration operation has ended.

With this new feature, we can suspend the reconnection.
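
A short usage sketch of the new call (error handling omitted); 'be' is the
CharBackend wired to the introspection chardev:

    /* Disable reconnection while the unhook is in progress... */
    int saved = qemu_chr_fe_reconnect_time(be, 0);

    /* ...and restore the previous setting once it is safe to reconnect. */
    qemu_chr_fe_reconnect_time(be, saved);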

CC: "Marc-André Lureau" 
CC: Paolo Bonzini 
Signed-off-by: Adalbert Lazăr 
---
 chardev/char-fe.c | 11 +++
 chardev/char-socket.c | 14 ++
 include/chardev/char-fe.h |  7 +++
 include/chardev/char.h|  1 +
 4 files changed, 33 insertions(+)

diff --git a/chardev/char-fe.c b/chardev/char-fe.c
index f3530a90e6..ac83528078 100644
--- a/chardev/char-fe.c
+++ b/chardev/char-fe.c
@@ -384,3 +384,14 @@ void qemu_chr_fe_disconnect(CharBackend *be)
 CHARDEV_GET_CLASS(chr)->chr_disconnect(chr);
 }
 }
+
+int qemu_chr_fe_reconnect_time(CharBackend *be, int secs)
+{
+Chardev *chr = be->chr;
+
+if (chr && CHARDEV_GET_CLASS(chr)->chr_reconnect_time) {
+return CHARDEV_GET_CLASS(chr)->chr_reconnect_time(chr, secs);
+}
+
+return -1;
+}
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 185fe38dda..bd966aace1 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -1471,6 +1471,19 @@ char_socket_get_connected(Object *obj, Error **errp)
 return s->state == TCP_CHARDEV_STATE_CONNECTED;
 }
 
+static int tcp_chr_reconnect_time(Chardev *chr, int secs)
+{
+SocketChardev *s = SOCKET_CHARDEV(chr);
+
+int old = s->reconnect_time;
+
+if (secs >= 0) {
+s->reconnect_time = secs;
+}
+
+return old;
+}
+
 static void char_socket_class_init(ObjectClass *oc, void *data)
 {
 ChardevClass *cc = CHARDEV_CLASS(oc);
@@ -1481,6 +1494,7 @@ static void char_socket_class_init(ObjectClass *oc, void *data)
 cc->chr_write = tcp_chr_write;
 cc->chr_sync_read = tcp_chr_sync_read;
 cc->chr_disconnect = tcp_chr_disconnect;
+cc->chr_reconnect_time = tcp_chr_reconnect_time;
 cc->get_msgfds = tcp_get_msgfds;
 cc->set_msgfds = tcp_set_msgfds;
 cc->chr_add_client = tcp_chr_add_client;
diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h
index a553843364..ff1897040a 100644
--- a/include/chardev/char-fe.h
+++ b/include/chardev/char-fe.h
@@ -135,6 +135,13 @@ void qemu_chr_fe_accept_input(CharBackend *be);
  */
 void qemu_chr_fe_disconnect(CharBackend *be);
 
+/**
+ * qemu_chr_fe_reconnect_time:
+ *
+ * Change the reconnect time and return the old value.
+ */
+int qemu_chr_fe_reconnect_time(CharBackend *be, int secs);
+
 /**
  * qemu_chr_fe_wait_connected:
  *
diff --git a/include/chardev/char.h b/include/chardev/char.h
index 00589a6025..80204d43ae 100644
--- a/include/chardev/char.h
+++ b/include/chardev/char.h
@@ -270,6 +270,7 @@ typedef struct ChardevClass {
 int (*chr_add_client)(Chardev *chr, int fd);
 int (*chr_wait_connected)(Chardev *chr, Error **errp);
 void (*chr_disconnect)(Chardev *chr);
+int (*chr_reconnect_time)(Chardev *be, int secs);
 void (*chr_accept_input)(Chardev *chr);
 void (*chr_set_echo)(Chardev *chr, bool echo);
 void (*chr_set_fe_open)(Chardev *chr, int fe_open);



[RFC PATCH v1 04/26] char-socket: add 'reconnecting' property

2020-04-14 Thread Adalbert Lazăr
This is used by the VM introspection object to check whether the connection
will be reestablished in case it disconnects for some reason.

The closing of the socket is used by any of the three parties involved,
KVM, the introspection tool and QEMU (eg. on force-reset), to signal
the other parties that the session is over. As such, it is very important
that the socket will reconnect.
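
A short sketch of how a caller might query the new property; 'chr' is the
Chardev backing the introspection socket:

    Error *err = NULL;
    bool will_reconnect = object_property_get_bool(OBJECT(chr), "reconnecting",
                                                   &err);

    if (err) {
        warn_report_err(err);
    } else if (!will_reconnect) {
        warn_report("VMI: the introspection socket will not reconnect");
    }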

CC: "Marc-André Lureau" 
CC: Paolo Bonzini 
Signed-off-by: Adalbert Lazăr 
---
 chardev/char-socket.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index fd0106ab85..22ab242748 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -1492,6 +1492,13 @@ char_socket_get_connected(Object *obj, Error **errp)
 return s->state == TCP_CHARDEV_STATE_CONNECTED;
 }
 
+static bool char_socket_get_reconnecting(Object *obj, Error **errp)
+{
+SocketChardev *s = SOCKET_CHARDEV(obj);
+
+return s->reconnect_time > 0;
+}
+
 static int tcp_chr_reconnect_time(Chardev *chr, int secs)
 {
 SocketChardev *s = SOCKET_CHARDEV(chr);
@@ -1528,6 +1535,10 @@ static void char_socket_class_init(ObjectClass *oc, void *data)
 
 object_class_property_add_bool(oc, "connected", char_socket_get_connected,
NULL, &error_abort);
+
+object_class_property_add_bool(oc, "reconnecting",
+   char_socket_get_reconnecting,
+   NULL, &error_abort);
 }
 
 static const TypeInfo char_socket_type_info = {
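
A rough consumer sketch (an assumption, not code from this series) showing how the property could be read through the usual QOM getter:

#include "qemu/osdep.h"
#include "qom/object.h"

/*
 * Hypothetical consumer sketch: query the new "reconnecting" property of a
 * socket chardev. "chr_obj" is assumed to be the chardev's Object; errors
 * are ignored for brevity.
 */
static bool chardev_will_reconnect(Object *chr_obj)
{
    /* True when the socket backend has reconnect_time > 0. */
    return object_property_get_bool(chr_obj, "reconnecting", NULL);
}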



[RFC PATCH v1 07/26] linux-headers: update with VM introspection interface

2020-04-14 Thread Adalbert Lazăr
Signed-off-by: Adalbert Lazăr 
---
 linux-headers/linux/kvm.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 265099100e..4e5d390640 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1010,6 +1010,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_NISV_TO_USER 177
 #define KVM_CAP_ARM_INJECT_EXT_DABT 178
 #define KVM_CAP_S390_VCPU_RESETS 179
+#define KVM_CAP_INTROSPECTION 180
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1558,6 +1559,25 @@ struct kvm_sev_dbg {
__u32 len;
 };
 
+struct kvm_introspection_hook {
+   __s32 fd;
+   __u32 padding;
+   __u8 uuid[16];
+};
+
+#define KVM_INTROSPECTION_HOOK    _IOW(KVMIO, 0xc3, struct kvm_introspection_hook)
+#define KVM_INTROSPECTION_UNHOOK  _IO(KVMIO, 0xc4)
+
+struct kvm_introspection_feature {
+   __u32 allow;
+   __s32 id;
+};
+
+#define KVM_INTROSPECTION_COMMAND _IOW(KVMIO, 0xc5, struct kvm_introspection_feature)
+#define KVM_INTROSPECTION_EVENT   _IOW(KVMIO, 0xc6, struct kvm_introspection_feature)
+
+#define KVM_INTROSPECTION_PREUNHOOK  _IO(KVMIO, 0xc7)
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX   (1 << 2)
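
For orientation only, a user-space sketch (assumed, not part of this patch) of how the proposed hook ioctl might be issued on a VM file descriptor:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Hypothetical user-space sketch: attach an introspection socket to a VM
 * file descriptor. "vm_fd" and "intro_sock_fd" are assumed to be valid.
 */
static int hook_introspection(int vm_fd, int intro_sock_fd,
                              const unsigned char uuid[16])
{
    struct kvm_introspection_hook hook = {
        .fd = intro_sock_fd,
    };

    memcpy(hook.uuid, uuid, sizeof(hook.uuid));

    /* Returns 0 on success, -1 with errno set otherwise. */
    return ioctl(vm_fd, KVM_INTROSPECTION_HOOK, &hook);
}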



[RFC PATCH v1 14/26] kvm: vmi: allow only one instance of the introspection object

2020-04-14 Thread Adalbert Lazăr
Because at most one introspection tool may introspect a VM at any given
time, we block the completion of a second introspection object instance.

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index 2ce8a60565..54c56c6e13 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -52,10 +52,18 @@ typedef struct VMIntrospection {
 bool kvmi_hooked;
 } VMIntrospection;
 
+typedef struct VMIntrospectionClass {
+ObjectClass parent_class;
+uint32_t instance_counter;
+VMIntrospection *uniq;
+} VMIntrospectionClass;
+
 #define TYPE_VM_INTROSPECTION "introspection"
 
 #define VM_INTROSPECTION(obj) \
 OBJECT_CHECK(VMIntrospection, (obj), TYPE_VM_INTROSPECTION)
+#define VM_INTROSPECTION_CLASS(class) \
+OBJECT_CLASS_CHECK(VMIntrospectionClass, (class), TYPE_VM_INTROSPECTION)
 
 static Error *vm_introspection_init(VMIntrospection *i);
 
@@ -81,8 +89,14 @@ static void update_vm_start_time(VMIntrospection *i)
 
 static void complete(UserCreatable *uc, Error **errp)
 {
+VMIntrospectionClass *ic = VM_INTROSPECTION_CLASS(OBJECT(uc)->class);
 VMIntrospection *i = VM_INTROSPECTION(uc);
 
+if (ic->instance_counter > 1) {
+error_setg(errp, "VMI: only one introspection object can be created");
+return;
+}
+
 if (!i->chardevid) {
 error_setg(errp, "VMI: chardev is not set");
 return;
@@ -106,6 +120,8 @@ static void complete(UserCreatable *uc, Error **errp)
 i->init_error = NULL;
 return;
 }
+
+ic->uniq = i;
 }
 
 static void prop_set_chardev(Object *obj, const char *value, Error **errp)
@@ -168,8 +184,11 @@ static void class_init(ObjectClass *oc, void *data)
 
 static void instance_init(Object *obj)
 {
+VMIntrospectionClass *ic = VM_INTROSPECTION_CLASS(obj->class);
 VMIntrospection *i = VM_INTROSPECTION(obj);
 
+ic->instance_counter++;
+
 i->sock_fd = -1;
 i->created_from_command_line = (qdev_hotplug == false);
 
@@ -234,6 +253,7 @@ static void cancel_handshake_timer(VMIntrospection *i)
 
 static void instance_finalize(Object *obj)
 {
+VMIntrospectionClass *ic = VM_INTROSPECTION_CLASS(obj->class);
 VMIntrospection *i = VM_INTROSPECTION(obj);
 
 g_free(i->chardevid);
@@ -248,12 +268,18 @@ static void instance_finalize(Object *obj)
 }
 
 error_free(i->init_error);
+
+ic->instance_counter--;
+if (!ic->instance_counter) {
+ic->uniq = NULL;
+}
 }
 
 static const TypeInfo info = {
 .name  = TYPE_VM_INTROSPECTION,
 .parent= TYPE_OBJECT,
 .class_init= class_init,
+.class_size= sizeof(VMIntrospectionClass),
 .instance_size = sizeof(VMIntrospection),
 .instance_finalize = instance_finalize,
 .instance_init = instance_init,



[RFC PATCH v1 03/26] char-socket: fix the client mode when created through QMP

2020-04-14 Thread Adalbert Lazăr
When the 'server' argument is absent, qmp_chardev_open_socket() silently
falls back to listen/server mode instead of client mode.

CC: "Marc-André Lureau" 
CC: Paolo Bonzini 
Signed-off-by: Adalbert Lazăr 
---
 chardev/char-socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 9b2deb0125..fd0106ab85 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -1310,7 +1310,7 @@ static void qmp_chardev_open_socket(Chardev *chr,
 SocketChardev *s = SOCKET_CHARDEV(chr);
 ChardevSocket *sock = backend->u.socket.data;
 bool do_nodelay = sock->has_nodelay ? sock->nodelay : false;
-bool is_listen  = sock->has_server  ? sock->server  : true;
+bool is_listen  = sock->has_server  ? sock->server  : false;
 bool is_telnet  = sock->has_telnet  ? sock->telnet  : false;
 bool is_tn3270  = sock->has_tn3270  ? sock->tn3270  : false;
 bool is_waitconnect = sock->has_wait? sock->wait: false;



[RFC PATCH v1 06/26] E820: extend the table access interface

2020-04-14 Thread Adalbert Lazăr
This new function is needed by the VM introspection object. With access
to all e820 entries, not just the RAM ones, the introspection tool can
differentiate between an invalid address and a reserved one.

CC: Paolo Bonzini 
CC: Richard Henderson 
CC: Eduardo Habkost 
CC: "Michael S. Tsirkin" 
CC: Marcel Apfelbaum 
Signed-off-by: Adalbert Lazăr 
---
 hw/i386/e820_memory_layout.c | 12 
 hw/i386/e820_memory_layout.h |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index bcf9eaf837..a875e9e326 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -57,3 +57,15 @@ bool e820_get_entry(int idx, uint32_t type, uint64_t *address, uint64_t *length)
 }
 return false;
 }
+
+bool e820_get_entry2(int idx, uint32_t *type, uint64_t *address,
+ uint64_t *length)
+{
+if (idx < e820_entries) {
+*type = le32_to_cpu(e820_table[idx].type);
+*address = le64_to_cpu(e820_table[idx].address);
+*length = le64_to_cpu(e820_table[idx].length);
+return true;
+}
+return false;
+}
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index 2a0ceb8b9c..a4555c21fb 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -36,6 +36,8 @@ int e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
 int e820_get_num_entries(void);
 bool e820_get_entry(int index, uint32_t type,
 uint64_t *address, uint64_t *length);
+bool e820_get_entry2(int index, uint32_t *type,
+ uint64_t *address, uint64_t *length);
 
 
 

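A minimal caller sketch (assumed, not part of the patch) that walks the whole table with the new accessor, reserved entries included:

#include "qemu/osdep.h"
#include "e820_memory_layout.h" /* assuming a caller inside hw/i386 */

/*
 * Hypothetical caller sketch: iterate every e820 entry regardless of type,
 * which e820_get_entry() cannot do since it filters on a single type.
 */
static void walk_e820_table(void)
{
    uint32_t type;
    uint64_t address, length;
    int idx;

    for (idx = 0; e820_get_entry2(idx, &type, &address, &length); idx++) {
        /* An introspection helper would forward each entry as-is. */
        printf("e820[%d]: type=%u addr=0x%" PRIx64 " len=0x%" PRIx64 "\n",
               idx, type, address, length);
    }
}
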


[RFC PATCH v1 13/26] kvm: vmi: block the object destruction if the chardev is connected

2020-04-14 Thread Adalbert Lazăr
The introspection tool can modify the VM while it is running (e.g. it can
set breakpoints), and these changes need to be reverted once the VM is no
longer introspected. Until that happens, we block the destruction of the
introspection object, which would otherwise lead to an unexpected shutdown
of the introspection channel.

Signed-off-by: Adalbert Lazăr 
---
 accel/kvm/vmi.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/accel/kvm/vmi.c b/accel/kvm/vmi.c
index f456ca56ef..2ce8a60565 100644
--- a/accel/kvm/vmi.c
+++ b/accel/kvm/vmi.c
@@ -151,11 +151,19 @@ static bool chardev_is_connected(VMIntrospection *i, Error **errp)
 return obj && object_property_get_bool(obj, "connected", errp);
 }
 
+static bool introspection_can_be_deleted(UserCreatable *uc)
+{
+VMIntrospection *i = VM_INTROSPECTION(uc);
+
+return !chardev_is_connected(i, NULL);
+}
+
 static void class_init(ObjectClass *oc, void *data)
 {
 UserCreatableClass *uc = USER_CREATABLE_CLASS(oc);
 
 uc->complete = complete;
+uc->can_be_deleted = introspection_can_be_deleted;
 }
 
 static void instance_init(Object *obj)



[RFC PATCH v1 02/26] char-socket: allow vsock parameters (cid, port)

2020-04-14 Thread Adalbert Lazăr
The introspection tool can run in a separate VM, in which case the
introspected VM establishes the connection over a virtual socket (vsock).

CC: "Marc-André Lureau" 
CC: Paolo Bonzini 
Signed-off-by: Adalbert Lazăr 
---
 chardev/char-socket.c | 27 ---
 chardev/char.c|  3 +++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index bd966aace1..9b2deb0125 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -23,6 +23,11 @@
  */
 
 #include "qemu/osdep.h"
+
+#ifdef CONFIG_AF_VSOCK
+#include <linux/vm_sockets.h>
+#endif /* CONFIG_AF_VSOCK */
+
 #include "chardev/char.h"
 #include "io/channel-socket.h"
 #include "io/channel-tls.h"
@@ -590,6 +595,14 @@ static char *qemu_chr_compute_filename(SocketChardev *s)
s->is_listen ? ",server" : "",
left, phost, right, pserv);
 
+#ifdef CONFIG_AF_VSOCK
+case AF_VSOCK:
+return g_strdup_printf("vsock:%d:%d%s",
+   ((struct sockaddr_vm *)(ss))->svm_cid,
+   ((struct sockaddr_vm *)(ss))->svm_port,
+   s->is_listen ? ",server" : "");
+#endif
+
 default:
 return g_strdup_printf("unknown");
 }
@@ -1378,18 +1391,19 @@ static void qemu_chr_parse_socket(QemuOpts *opts, ChardevBackend *backend,
 {
 const char *path = qemu_opt_get(opts, "path");
 const char *host = qemu_opt_get(opts, "host");
+const char *cid  = qemu_opt_get(opts, "cid");
 const char *port = qemu_opt_get(opts, "port");
 const char *fd = qemu_opt_get(opts, "fd");
 SocketAddressLegacy *addr;
 ChardevSocket *sock;
 
-if ((!!path + !!fd + !!host) != 1) {
+if ((!!path + !!fd + !!host + !!cid) != 1) {
 error_setg(errp,
-   "Exactly one of 'path', 'fd' or 'host' required");
+   "Exactly one of 'path', 'fd', 'cid' or 'host' required");
 return;
 }
 
-if (host && !port) {
+if ((host || cid) && !port) {
 error_setg(errp, "chardev: socket: no port given");
 return;
 }
@@ -1444,6 +1458,13 @@ static void qemu_chr_parse_socket(QemuOpts *opts, ChardevBackend *backend,
 .has_ipv6 = qemu_opt_get(opts, "ipv6"),
 .ipv6 = qemu_opt_get_bool(opts, "ipv6", 0),
 };
+} else if (cid) {
+addr->type = SOCKET_ADDRESS_LEGACY_KIND_VSOCK;
+addr->u.vsock.data = g_new0(VsockSocketAddress, 1);
+*addr->u.vsock.data = (VsockSocketAddress) {
+.cid  = g_strdup(cid),
+.port = g_strdup(port),
+};
 } else if (fd) {
 addr->type = SOCKET_ADDRESS_LEGACY_KIND_FD;
 addr->u.fd.data = g_new(String, 1);
diff --git a/chardev/char.c b/chardev/char.c
index e77564060d..39e36ceb97 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -852,6 +852,9 @@ QemuOptsList qemu_chardev_opts = {
 },{
 .name = "host",
 .type = QEMU_OPT_STRING,
+},{
+.name = "cid",
+.type = QEMU_OPT_STRING,
 },{
 .name = "port",
 .type = QEMU_OPT_STRING,
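
For context, a small peer-side sketch (an assumption, not part of the series): an introspection tool in another VM listening on a vsock port that a "-chardev socket,cid=...,port=..." client would connect to:

#include <sys/socket.h>
#include <linux/vm_sockets.h>
#include <unistd.h>

/*
 * Hypothetical peer sketch: listen on a vsock port from the tool's VM.
 * The port value passed by the caller is illustrative.
 */
static int vsock_listen(unsigned int port)
{
    struct sockaddr_vm addr = {
        .svm_family = AF_VSOCK,
        .svm_cid = VMADDR_CID_ANY,  /* accept connections from any CID */
        .svm_port = port,
    };
    int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

    if (fd < 0) {
        return -1;
    }
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(fd, 1) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}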


