[Qemu-devel] [PATCH v3 21/23] hw/rdma: Do not use bitmap_zero_extend to free bitmap

2018-11-12 Thread Yuval Shaia
bitmap_zero_extend is designed to work for extending, not for
shrinking.
Using g_free instead.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_rm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 0a5ab8935a..35a96d9a64 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -43,7 +43,7 @@ static inline void res_tbl_free(RdmaRmResTbl *tbl)
 {
 qemu_mutex_destroy(>lock);
 g_free(tbl->tbl);
-bitmap_zero_extend(tbl->bitmap, tbl->tbl_sz, 0);
+g_free(tbl->bitmap);
 }
 
 static inline void *res_tbl_get(RdmaRmResTbl *tbl, uint32_t handle)
-- 
2.17.2




[Qemu-devel] [PATCH v3 14/23] hw/rdma: Initialize node_guid from vmxnet3 mac address

2018-11-12 Thread Yuval Shaia
node_guid should be set once the device is loaded.
Make node_guid be GID format (32 bit) of PCI function 0 vmxnet3 device's
MAC.

A new function was added to do the conversion.
So for example the MAC 56:b6:44:e9:62:dc will be converted to GID
54b6:44ff:fee9:62dc.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_utils.h  |  9 +
 hw/rdma/vmw/pvrdma_cmd.c  | 10 --
 hw/rdma/vmw/pvrdma_main.c |  5 -
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/hw/rdma/rdma_utils.h b/hw/rdma/rdma_utils.h
index 989db249ef..202abb3366 100644
--- a/hw/rdma/rdma_utils.h
+++ b/hw/rdma/rdma_utils.h
@@ -63,4 +63,13 @@ extern unsigned long pr_dbg_cnt;
 void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen);
 void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len);
 
+static inline void addrconf_addr_eui48(uint8_t *eui, const char *addr)
+{
+memcpy(eui, addr, 3);
+eui[3] = 0xFF;
+eui[4] = 0xFE;
+memcpy(eui + 5, addr + 3, 3);
+eui[0] ^= 2;
+}
+
 #endif
diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index a334f6205e..2979582fac 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -592,16 +592,6 @@ static int create_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 return -EINVAL;
 }
 
-/* TODO: Since drivers stores node_guid at load_dsr phase then this
- * assignment is not relevant, i need to figure out a way how to
- * retrieve MAC of our netdev */
-if (!cmd->index) {
-dev->node_guid =
-dev->rdma_dev_res.ports[0].gid_tbl[0].gid.global.interface_id;
-pr_dbg("dev->node_guid=0x%llx\n",
-   (long long unsigned int)be64_to_cpu(dev->node_guid));
-}
-
 return 0;
 }
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index fa6468d221..95e9322b7c 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -264,7 +264,7 @@ static void init_dsr_dev_caps(PVRDMADev *dev)
 dsr->caps.sys_image_guid = 0;
 pr_dbg("sys_image_guid=%" PRIx64 "\n", dsr->caps.sys_image_guid);
 
-dsr->caps.node_guid = cpu_to_be64(dev->node_guid);
+dsr->caps.node_guid = dev->node_guid;
 pr_dbg("node_guid=%" PRIx64 "\n", be64_to_cpu(dsr->caps.node_guid));
 
 dsr->caps.phys_port_cnt = MAX_PORTS;
@@ -579,6 +579,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 /* Break if not vmxnet3 device in slot 0 */
 dev->func0 = VMXNET3(pci_get_function_0(pdev));
 
+addrconf_addr_eui48((unsigned char *)>node_guid,
+(const char *)>func0->conf.macaddr.a);
+
 memdev_root = object_resolve_path("/objects", NULL);
 if (memdev_root) {
 object_child_foreach(memdev_root, pvrdma_check_ram_shared, 
_shared);
-- 
2.17.2




Re: [Qemu-devel] [PATCH] slirp: add tftp tracing

2018-11-12 Thread Liam Merwick



On 13/11/2018 07:03, Gerd Hoffmann wrote:

Useful when debugging pxeboot, to see what the guest tries to do.

Signed-off-by: Gerd Hoffmann 


Reviewed-by: Liam Merwick 



---
  Makefile.objs  | 1 +
  slirp/tftp.c   | 3 +++
  slirp/trace-events | 5 +
  3 files changed, 9 insertions(+)
  create mode 100644 slirp/trace-events

diff --git a/Makefile.objs b/Makefile.objs
index 1e1ff387d7..31852eaf8f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -251,6 +251,7 @@ trace-events-subdirs += net
  trace-events-subdirs += qapi
  trace-events-subdirs += qom
  trace-events-subdirs += scsi
+trace-events-subdirs += slirp
  trace-events-subdirs += target/arm
  trace-events-subdirs += target/i386
  trace-events-subdirs += target/mips
diff --git a/slirp/tftp.c b/slirp/tftp.c
index a9bc4bb1b6..735b57aa55 100644
--- a/slirp/tftp.c
+++ b/slirp/tftp.c
@@ -26,6 +26,7 @@
  #include "slirp.h"
  #include "qemu-common.h"
  #include "qemu/cutils.h"
+#include "trace.h"
  
  static inline int tftp_session_in_use(struct tftp_session *spt)

  {
@@ -204,6 +205,7 @@ static void tftp_send_error(struct tftp_session *spt,
struct mbuf *m;
struct tftp_t *tp;
  
+  trace_slirp_tftp_error(msg);

m = m_get(spt->slirp);
  
if (!m) {

@@ -323,6 +325,7 @@ static void tftp_handle_rrq(Slirp *slirp, struct 
sockaddr_storage *srcsas,
break;
  }
}
+  trace_slirp_tftp_rrq(req_fname);
  
/* check mode */

if ((pktlen - k) < 6) {
diff --git a/slirp/trace-events b/slirp/trace-events
new file mode 100644
index 00..ff8f656e8c
--- /dev/null
+++ b/slirp/trace-events
@@ -0,0 +1,5 @@
+# See docs/devel/tracing.txt for syntax documentation.
+
+# slirp/tftp.c
+slirp_tftp_rrq(const char *file) "file: %s"
+slirp_tftp_error(const char *file) "msg: %s"





[Qemu-devel] [PATCH v3 19/23] vl: Introduce shutdown_notifiers

2018-11-12 Thread Yuval Shaia
A notifier will be used for signaling the shutdown event, to inform that the
system is shutting down. This will allow devices and other components to run
any cleanup code needed before the VM is shut down.

Signed-off-by: Yuval Shaia 
---
 include/sysemu/sysemu.h |  1 +
 vl.c| 15 ++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 8d6095d98b..0d15f16492 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -80,6 +80,7 @@ void qemu_register_wakeup_notifier(Notifier *notifier);
 void qemu_system_shutdown_request(ShutdownCause reason);
 void qemu_system_powerdown_request(void);
 void qemu_register_powerdown_notifier(Notifier *notifier);
+void qemu_register_shutdown_notifier(Notifier *notifier);
 void qemu_system_debug_request(void);
 void qemu_system_vmstop_request(RunState reason);
 void qemu_system_vmstop_request_prepare(void);
diff --git a/vl.c b/vl.c
index 1fcacc5caa..d33d52522c 100644
--- a/vl.c
+++ b/vl.c
@@ -1578,6 +1578,8 @@ static NotifierList suspend_notifiers =
 NOTIFIER_LIST_INITIALIZER(suspend_notifiers);
 static NotifierList wakeup_notifiers =
 NOTIFIER_LIST_INITIALIZER(wakeup_notifiers);
+static NotifierList shutdown_notifiers =
+NOTIFIER_LIST_INITIALIZER(shutdown_notifiers);
 static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE);
 
 ShutdownCause qemu_shutdown_requested_get(void)
@@ -1809,6 +1811,12 @@ static void qemu_system_powerdown(void)
 notifier_list_notify(_notifiers, NULL);
 }
 
+static void qemu_system_shutdown(ShutdownCause cause)
+{
+qapi_event_send_shutdown(shutdown_caused_by_guest(cause));
+notifier_list_notify(_notifiers, );
+}
+
 void qemu_system_powerdown_request(void)
 {
 trace_qemu_system_powerdown_request();
@@ -1821,6 +1829,11 @@ void qemu_register_powerdown_notifier(Notifier *notifier)
 notifier_list_add(_notifiers, notifier);
 }
 
+void qemu_register_shutdown_notifier(Notifier *notifier)
+{
+notifier_list_add(_notifiers, notifier);
+}
+
 void qemu_system_debug_request(void)
 {
 debug_requested = 1;
@@ -1848,7 +1861,7 @@ static bool main_loop_should_exit(void)
 request = qemu_shutdown_requested();
 if (request) {
 qemu_kill_report();
-qapi_event_send_shutdown(shutdown_caused_by_guest(request));
+qemu_system_shutdown(request);
 if (no_shutdown) {
 vm_stop(RUN_STATE_SHUTDOWN);
 } else {
-- 
2.17.2




[Qemu-devel] [PATCH v3 13/23] hw/pvrdma: Make sure PCI function 0 is vmxnet3

2018-11-12 Thread Yuval Shaia
The guest driver enforces it; we should also.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma.h  | 2 ++
 hw/rdma/vmw/pvrdma_main.c | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index b019cb843a..10a3c4fb7c 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -20,6 +20,7 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/msix.h"
 #include "chardev/char-fe.h"
+#include "hw/net/vmxnet3_defs.h"
 
 #include "../rdma_backend_defs.h"
 #include "../rdma_rm_defs.h"
@@ -85,6 +86,7 @@ typedef struct PVRDMADev {
 RdmaBackendDev backend_dev;
 RdmaDeviceResources rdma_dev_res;
 CharBackend mad_chr;
+VMXNET3State *func0;
 } PVRDMADev;
 #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index ac8c092db0..fa6468d221 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -576,6 +576,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 return;
 }
 
+/* Break if not vmxnet3 device in slot 0 */
+dev->func0 = VMXNET3(pci_get_function_0(pdev));
+
 memdev_root = object_resolve_path("/objects", NULL);
 if (memdev_root) {
 object_child_foreach(memdev_root, pvrdma_check_ram_shared, 
_shared);
-- 
2.17.2




[Qemu-devel] [PATCH v3 20/23] hw/pvrdma: Clean device's resource when system is shutdown

2018-11-12 Thread Yuval Shaia
In order to clean up some external resources such as GIDs, QPs etc,
register to receive a notification when the VM is shut down.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma.h  |  2 ++
 hw/rdma/vmw/pvrdma_main.c | 12 
 2 files changed, 14 insertions(+)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index 10a3c4fb7c..ffae36986e 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -17,6 +17,7 @@
 #define PVRDMA_PVRDMA_H
 
 #include "qemu/units.h"
+#include "qemu/notify.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/msix.h"
 #include "chardev/char-fe.h"
@@ -87,6 +88,7 @@ typedef struct PVRDMADev {
 RdmaDeviceResources rdma_dev_res;
 CharBackend mad_chr;
 VMXNET3State *func0;
+Notifier shutdown_notifier;
 } PVRDMADev;
 #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 95e9322b7c..45a59cddf9 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -24,6 +24,7 @@
 #include "hw/qdev-properties.h"
 #include "cpu.h"
 #include "trace.h"
+#include "sysemu/sysemu.h"
 
 #include "../rdma_rm.h"
 #include "../rdma_backend.h"
@@ -559,6 +560,14 @@ static int pvrdma_check_ram_shared(Object *obj, void 
*opaque)
 return 0;
 }
 
+static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
+{
+PVRDMADev *dev = container_of(n, PVRDMADev, shutdown_notifier);
+PCIDevice *pci_dev = PCI_DEVICE(dev);
+
+pvrdma_fini(pci_dev);
+}
+
 static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 {
 int rc;
@@ -623,6 +632,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 goto out;
 }
 
+dev->shutdown_notifier.notify = pvrdma_shutdown_notifier;
+qemu_register_shutdown_notifier(>shutdown_notifier);
+
 out:
 if (rc) {
 error_append_hint(errp, "Device fail to load\n");
-- 
2.17.2




[Qemu-devel] [PATCH v3 11/23] hw/pvrdma: Add support to allow guest to configure GID table

2018-11-12 Thread Yuval Shaia
The control over the RDMA device's GID table is done by updating the
device's Ethernet function addresses.
Usually the first GID entry is determined by the MAC address, the second
by the first IPv6 address and the third by the IPv4 address. Other
entries can be added by adding more IP addresses. The opposite is the
same, i.e. whenever an address is removed, the corresponding GID entry
is removed.

The process is done by the network and RDMA stacks. Whenever an address
is added the ib_core driver is notified and calls the device driver
add_gid function which in turn update the device.

To support this in pvrdma device we need to hook into the create_bind
and destroy_bind HW commands triggered by pvrdma driver in guest.
Whenever a change is made to the pvrdma device's GID table a special
QMP message is sent to be processed by libvirt to update the address of
the backend Ethernet device.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 243 +++-
 hw/rdma/rdma_backend.h  |  22 ++--
 hw/rdma/rdma_backend_defs.h |   3 +-
 hw/rdma/rdma_rm.c   | 104 ++-
 hw/rdma/rdma_rm.h   |  17 ++-
 hw/rdma/rdma_rm_defs.h  |   9 +-
 hw/rdma/rdma_utils.h|  15 +++
 hw/rdma/vmw/pvrdma.h|   2 +-
 hw/rdma/vmw/pvrdma_cmd.c|  55 
 hw/rdma/vmw/pvrdma_main.c   |  25 +---
 hw/rdma/vmw/pvrdma_qp_ops.c |  20 +++
 11 files changed, 370 insertions(+), 145 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 3eb0099f8d..5675504165 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -18,12 +18,14 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qlist.h"
 #include "qapi/qmp/qnum.h"
+#include "qapi/qapi-events-rdma.h"
 
 #include 
 #include 
 #include 
 #include 
 
+#include "contrib/rdmacm-mux/rdmacm-mux.h"
 #include "trace.h"
 #include "rdma_utils.h"
 #include "rdma_rm.h"
@@ -300,11 +302,11 @@ static int build_host_sge_array(RdmaDeviceResources 
*rdma_dev_res,
 return 0;
 }
 
-static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
-uint32_t num_sge)
+static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
+union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
 {
-struct backend_umad umad = {0};
-char *hdr, *msg;
+RdmaCmMuxMsg msg = {0};
+char *hdr, *data;
 int ret;
 
 pr_dbg("num_sge=%d\n", num_sge);
@@ -313,41 +315,50 @@ static int mad_send(RdmaBackendDev *backend_dev, struct 
ibv_sge *sge,
 return -EINVAL;
 }
 
-umad.hdr.length = sge[0].length + sge[1].length;
-pr_dbg("msg_len=%d\n", umad.hdr.length);
+msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_MAD;
+memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));
 
-if (umad.hdr.length > sizeof(umad.mad)) {
+msg.umad_len = sge[0].length + sge[1].length;
+pr_dbg("umad_len=%d\n", msg.umad_len);
+
+if (msg.umad_len > sizeof(msg.umad.mad)) {
 return -ENOMEM;
 }
 
-umad.hdr.addr.qpn = htobe32(1);
-umad.hdr.addr.grh_present = 1;
-umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
-memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
-umad.hdr.addr.hop_limit = 1;
+msg.umad.hdr.addr.qpn = htobe32(1);
+msg.umad.hdr.addr.grh_present = 1;
+pr_dbg("sgid_idx=%d\n", sgid_idx);
+pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
+msg.umad.hdr.addr.gid_index = sgid_idx;
+memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
+msg.umad.hdr.addr.hop_limit = 1;
 
 hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
-msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+
+pr_dbg_buf("mad_hdr", hdr, sge[0].length);
+pr_dbg_buf("mad_data", data, sge[1].length);
 
-memcpy([0], hdr, sge[0].length);
-memcpy([sge[0].length], msg, sge[1].length);
+memcpy([0], hdr, sge[0].length);
+memcpy([sge[0].length], data, sge[1].length);
 
-rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
+rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
 rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
 
-ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *),
-sizeof(umad));
+ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *),
+sizeof(msg));
 
 pr_dbg("qemu_chr_fe_write=%d\n", ret);
 
-return (ret != sizeof(umad));
+return (ret != sizeof(msg));
 }
 
 void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 RdmaBackendQP *qp, uint8_t qp_type,
 struct ibv_sge *sge, uint32_t num_sge,
-union ibv_gid *dgid, uint32_t dqpn,
-uint32_t dqkey, void *ctx)
+   

[Qemu-devel] [PATCH v3 23/23] docs: Update pvrdma device documentation

2018-11-12 Thread Yuval Shaia
Interface with the device is changed with the addition of support for
MAD packets.
Adjust documentation accordingly.

While there, fix a minor mistake which may lead one to think that there is a
relation between using RXE on the host and the compatibility with bare-metal
peers.

Signed-off-by: Yuval Shaia 
---
 docs/pvrdma.txt | 103 +++-
 1 file changed, 84 insertions(+), 19 deletions(-)

diff --git a/docs/pvrdma.txt b/docs/pvrdma.txt
index 5599318159..9e8d1674b7 100644
--- a/docs/pvrdma.txt
+++ b/docs/pvrdma.txt
@@ -9,8 +9,9 @@ It works with its Linux Kernel driver AS IS, no need for any 
special guest
 modifications.
 
 While it complies with the VMware device, it can also communicate with bare
-metal RDMA-enabled machines and does not require an RDMA HCA in the host, it
-can work with Soft-RoCE (rxe).
+metal RDMA-enabled machines as peers.
+
+It does not require an RDMA HCA in the host, it can work with Soft-RoCE (rxe).
 
 It does not require the whole guest RAM to be pinned allowing memory
 over-commit and, even if not implemented yet, migration support will be
@@ -78,29 +79,93 @@ the required RDMA libraries.
 
 3. Usage
 
+
+
+3.1 VM Memory settings
+==
 Currently the device is working only with memory backed RAM
 and it must be mark as "shared":
-m 1G \
-object memory-backend-ram,id=mb1,size=1G,share \
-numa node,memdev=mb1 \
 
-The pvrdma device is composed of two functions:
- - Function 0 is a vmxnet Ethernet Device which is redundant in Guest
-   but is required to pass the ibdevice GID using its MAC.
-   Examples:
- For an rxe backend using eth0 interface it will use its mac:
-   -device vmxnet3,addr=.0,multifunction=on,mac=
- For an SRIOV VF, we take the Ethernet Interface exposed by it:
-   -device vmxnet3,multifunction=on,mac=
- - Function 1 is the actual device:
-   -device 
pvrdma,addr=.1,backend-dev=,backend-gid-idx=,backend-port=
-   where the ibdevice can be rxe or RDMA VF (e.g. mlx5_4)
- Note: Pay special attention that the GID at backend-gid-idx matches vmxnet's 
MAC.
- The rules of conversion are part of the RoCE spec, but since manual conversion
- is not required, spotting problems is not hard:
-Example: GID: fe80::::7efe:90ff:fecb:743a
- MAC: 7c:fe:90:cb:74:3a
-Note the difference between the first byte of the MAC and the GID.
+
+3.2 MAD Multiplexer
+===
+MAD Multiplexer is a service that exposes MAD-like interface for VMs in
+order to overcome the limitation where only single entity can register with
+MAD layer to send and receive RDMA-CM MAD packets.
+
+To build rdmacm-mux run
+# make rdmacm-mux
+
+The program accepts 3 command line arguments and exposes a UNIX socket to
+be used to relay control and data messages to and from the service.
+-s unix-socket-path   Path to unix socket to listen on
+  (default /var/run/rdmacm-mux)
+-d rdma-device-name   Name of RDMA device to register with
+  (default rxe0)
+-p rdma-device-port   Port number of RDMA device to register with
+  (default 1)
+The final UNIX socket file name is a concatenation of the 3 arguments so
+for example for device name mlx5_0 and port 2 the file
+/var/run/rdmacm-mux-mlx5_0-2 will be created.
+
+Please refer to contrib/rdmacm-mux for more details.
+
+
+3.3 PCI devices settings
+
+RoCE device exposes two functions - Ethernet and RDMA.
+To support it, pvrdma device is composed of two PCI functions, an Ethernet
+device of type vmxnet3 on PCI slot 0 and a pvrdma device on PCI slot 1. The
+Ethernet function can be used for other Ethernet purposes such as IP.
+
+
+3.4 Device parameters
+=
+- netdev: Specifies the Ethernet device on host. For Soft-RoCE (rxe) this
+  would be the Ethernet device used to create it. For any other physical
+  RoCE device this would be the netdev name of the device.
+- ibdev: The IB device name on host for example rxe0, mlx5_0 etc.
+- mad-chardev: The name of the MAD multiplexer char device.
+- ibport: In case of multi-port device (such as Mellanox's HCA) this
+  specify the port to use. If not set 1 will be used.
+- dev-caps-max-mr-size: The maximum size of MR.
+- dev-caps-max-qp: Maximum number of QPs.
+- dev-caps-max-sge: Maximum number of SGE elements in WR.
+- dev-caps-max-cq: Maximum number of CQs.
+- dev-caps-max-mr: Maximum number of MRs.
+- dev-caps-max-pd: Maximum number of PDs.
+- dev-caps-max-ah: Maximum number of AHs.
+
+Notes:
+- The first 3 parameters are mandatory settings, the rest have their
+  defaults.
+- The last 8 parameters (the ones that prefixed by dev-caps) defines the top
+  limits but the final values are adjusted by the backend device limitations.
+
+3.5 Example
+===
+Define bridge device with vmxnet3 network backend:
+
+  
+  
+  
+  
+
+
+Define pvrdma device:
+
+  
+  
+  
+  
+  
+  
+  
+  
+
 
 
 
-- 
2.17.2




[Qemu-devel] [PATCH v3 15/23] hw/pvrdma: Make device state depend on Ethernet function state

2018-11-12 Thread Yuval Shaia
The user should be able to control the device by changing the Ethernet
function state, so if the user runs 'ifconfig ens3 down' the PVRDMA function
should be down as well.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma_cmd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index 2979582fac..0d3c818c20 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -139,7 +139,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP;
 resp->hdr.err = 0;
 
-resp->attrs.state = attrs.state;
+resp->attrs.state = dev->func0->device_active ? attrs.state :
+PVRDMA_PORT_DOWN;
 resp->attrs.max_mtu = attrs.max_mtu;
 resp->attrs.active_mtu = attrs.active_mtu;
 resp->attrs.phys_state = attrs.phys_state;
-- 
2.17.2




[Qemu-devel] [PATCH v3 12/23] vmxnet3: Move some definitions to header file

2018-11-12 Thread Yuval Shaia
pvrdma setup requires vmxnet3 device on PCI function 0 and PVRDMA device
on PCI function 1.
pvrdma device needs to access vmxnet3 device object for several reasons:
1. Make sure PCI function 0 is vmxnet3.
2. To monitor vmxnet3 device state.
3. To configure node_guid according to vmxnet3 device's MAC address.

To be able to access vmxnet3 device the definition of VMXNET3State is
moved to a new header file.

Signed-off-by: Yuval Shaia 
Reviewed-by: Dmitry Fleytman 
---
 hw/net/vmxnet3.c  | 116 +---
 hw/net/vmxnet3_defs.h | 133 ++
 2 files changed, 134 insertions(+), 115 deletions(-)
 create mode 100644 hw/net/vmxnet3_defs.h

diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 3648630386..54746a4030 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -18,7 +18,6 @@
 #include "qemu/osdep.h"
 #include "hw/hw.h"
 #include "hw/pci/pci.h"
-#include "net/net.h"
 #include "net/tap.h"
 #include "net/checksum.h"
 #include "sysemu/sysemu.h"
@@ -29,6 +28,7 @@
 #include "migration/register.h"
 
 #include "vmxnet3.h"
+#include "vmxnet3_defs.h"
 #include "vmxnet_debug.h"
 #include "vmware_utils.h"
 #include "net_tx_pkt.h"
@@ -131,23 +131,11 @@ typedef struct VMXNET3Class {
 DeviceRealize parent_dc_realize;
 } VMXNET3Class;
 
-#define TYPE_VMXNET3 "vmxnet3"
-#define VMXNET3(obj) OBJECT_CHECK(VMXNET3State, (obj), TYPE_VMXNET3)
-
 #define VMXNET3_DEVICE_CLASS(klass) \
 OBJECT_CLASS_CHECK(VMXNET3Class, (klass), TYPE_VMXNET3)
 #define VMXNET3_DEVICE_GET_CLASS(obj) \
 OBJECT_GET_CLASS(VMXNET3Class, (obj), TYPE_VMXNET3)
 
-/* Cyclic ring abstraction */
-typedef struct {
-hwaddr pa;
-uint32_t size;
-uint32_t cell_size;
-uint32_t next;
-uint8_t gen;
-} Vmxnet3Ring;
-
 static inline void vmxnet3_ring_init(PCIDevice *d,
 Vmxnet3Ring *ring,
  hwaddr pa,
@@ -245,108 +233,6 @@ vmxnet3_dump_rx_descr(struct Vmxnet3_RxDesc *descr)
   descr->rsvd, descr->dtype, descr->ext1, descr->btype);
 }
 
-/* Device state and helper functions */
-#define VMXNET3_RX_RINGS_PER_QUEUE (2)
-
-typedef struct {
-Vmxnet3Ring tx_ring;
-Vmxnet3Ring comp_ring;
-
-uint8_t intr_idx;
-hwaddr tx_stats_pa;
-struct UPT1_TxStats txq_stats;
-} Vmxnet3TxqDescr;
-
-typedef struct {
-Vmxnet3Ring rx_ring[VMXNET3_RX_RINGS_PER_QUEUE];
-Vmxnet3Ring comp_ring;
-uint8_t intr_idx;
-hwaddr rx_stats_pa;
-struct UPT1_RxStats rxq_stats;
-} Vmxnet3RxqDescr;
-
-typedef struct {
-bool is_masked;
-bool is_pending;
-bool is_asserted;
-} Vmxnet3IntState;
-
-typedef struct {
-PCIDevice parent_obj;
-NICState *nic;
-NICConf conf;
-MemoryRegion bar0;
-MemoryRegion bar1;
-MemoryRegion msix_bar;
-
-Vmxnet3RxqDescr rxq_descr[VMXNET3_DEVICE_MAX_RX_QUEUES];
-Vmxnet3TxqDescr txq_descr[VMXNET3_DEVICE_MAX_TX_QUEUES];
-
-/* Whether MSI-X support was installed successfully */
-bool msix_used;
-hwaddr drv_shmem;
-hwaddr temp_shared_guest_driver_memory;
-
-uint8_t txq_num;
-
-/* This boolean tells whether RX packet being indicated has to */
-/* be split into head and body chunks from different RX rings  */
-bool rx_packets_compound;
-
-bool rx_vlan_stripping;
-bool lro_supported;
-
-uint8_t rxq_num;
-
-/* Network MTU */
-uint32_t mtu;
-
-/* Maximum number of fragments for indicated TX packets */
-uint32_t max_tx_frags;
-
-/* Maximum number of fragments for indicated RX packets */
-uint16_t max_rx_frags;
-
-/* Index for events interrupt */
-uint8_t event_int_idx;
-
-/* Whether automatic interrupts masking enabled */
-bool auto_int_masking;
-
-bool peer_has_vhdr;
-
-/* TX packets to QEMU interface */
-struct NetTxPkt *tx_pkt;
-uint32_t offload_mode;
-uint32_t cso_or_gso_size;
-uint16_t tci;
-bool needs_vlan;
-
-struct NetRxPkt *rx_pkt;
-
-bool tx_sop;
-bool skip_current_tx_pkt;
-
-uint32_t device_active;
-uint32_t last_command;
-
-uint32_t link_status_and_speed;
-
-Vmxnet3IntState interrupt_states[VMXNET3_MAX_INTRS];
-
-uint32_t temp_mac;   /* To store the low part first */
-
-MACAddr perm_mac;
-uint32_t vlan_table[VMXNET3_VFT_SIZE];
-uint32_t rx_mode;
-MACAddr *mcast_list;
-uint32_t mcast_list_len;
-uint32_t mcast_list_buff_size; /* needed for live migration. */
-
-/* Compatibility flags for migration */
-uint32_t compat_flags;
-} VMXNET3State;
-
 /* Interrupt management */
 
 /*
diff --git a/hw/net/vmxnet3_defs.h b/hw/net/vmxnet3_defs.h
new file mode 100644
index 00..6c19d29b12
--- /dev/null
+++ b/hw/net/vmxnet3_defs.h
@@ -0,0 +1,133 @@
+/*
+ * 

[Qemu-devel] [PATCH v3 22/23] hw/rdma: Do not call rdma_backend_del_gid on an empty gid

2018-11-12 Thread Yuval Shaia
When the device goes down, the function fini_ports loops over all entries
in the gid table regardless of whether an entry is valid or not. In case
an entry is not valid we'd like to skip any further processing in the
backend device.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_rm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 35a96d9a64..e3f6b2f6ea 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -555,6 +555,10 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 {
 int rc;
 
+if (!dev_res->port.gid_tbl[gid_idx].gid.global.interface_id) {
+return 0;
+}
+
 rc = rdma_backend_del_gid(backend_dev, ifname,
   _res->port.gid_tbl[gid_idx].gid);
 if (rc < 0) {
-- 
2.17.2




[Qemu-devel] [PATCH v3 18/23] hw/rdma: Remove unneeded code that handles more that one port

2018-11-12 Thread Yuval Shaia
The device supports only one port; let's remove the dead code that handles
more than one port.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_rm.c  | 34 --
 hw/rdma/rdma_rm.h  |  2 +-
 hw/rdma/rdma_rm_defs.h |  4 ++--
 3 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index fe0979415d..0a5ab8935a 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -545,7 +545,7 @@ int rdma_rm_add_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 return -EINVAL;
 }
 
-memcpy(_res->ports[0].gid_tbl[gid_idx].gid, gid, sizeof(*gid));
+memcpy(_res->port.gid_tbl[gid_idx].gid, gid, sizeof(*gid));
 
 return 0;
 }
@@ -556,15 +556,15 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 int rc;
 
 rc = rdma_backend_del_gid(backend_dev, ifname,
-  _res->ports[0].gid_tbl[gid_idx].gid);
+  _res->port.gid_tbl[gid_idx].gid);
 if (rc < 0) {
 pr_dbg("Fail to delete gid\n");
 return -EINVAL;
 }
 
-memset(dev_res->ports[0].gid_tbl[gid_idx].gid.raw, 0,
-   sizeof(dev_res->ports[0].gid_tbl[gid_idx].gid));
-dev_res->ports[0].gid_tbl[gid_idx].backend_gid_index = -1;
+memset(dev_res->port.gid_tbl[gid_idx].gid.raw, 0,
+   sizeof(dev_res->port.gid_tbl[gid_idx].gid));
+dev_res->port.gid_tbl[gid_idx].backend_gid_index = -1;
 
 return 0;
 }
@@ -577,16 +577,16 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources 
*dev_res,
 return -EINVAL;
 }
 
-if (unlikely(dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index == -1)) 
{
-dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index =
+if (unlikely(dev_res->port.gid_tbl[sgid_idx].backend_gid_index == -1)) {
+dev_res->port.gid_tbl[sgid_idx].backend_gid_index =
 rdma_backend_get_gid_index(backend_dev,
-   
_res->ports[0].gid_tbl[sgid_idx].gid);
+   _res->port.gid_tbl[sgid_idx].gid);
 }
 
 pr_dbg("backend_gid_index=%d\n",
-   dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index);
+   dev_res->port.gid_tbl[sgid_idx].backend_gid_index);
 
-return dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index;
+return dev_res->port.gid_tbl[sgid_idx].backend_gid_index;
 }
 
 static void destroy_qp_hash_key(gpointer data)
@@ -596,15 +596,13 @@ static void destroy_qp_hash_key(gpointer data)
 
 static void init_ports(RdmaDeviceResources *dev_res)
 {
-int i, j;
+int i;
 
-memset(dev_res->ports, 0, sizeof(dev_res->ports));
+memset(_res->port, 0, sizeof(dev_res->port));
 
-for (i = 0; i < MAX_PORTS; i++) {
-dev_res->ports[i].state = IBV_PORT_DOWN;
-for (j = 0; j < MAX_PORT_GIDS; j++) {
-dev_res->ports[i].gid_tbl[j].backend_gid_index = -1;
-}
+dev_res->port.state = IBV_PORT_DOWN;
+for (i = 0; i < MAX_PORT_GIDS; i++) {
+dev_res->port.gid_tbl[i].backend_gid_index = -1;
 }
 }
 
@@ -613,7 +611,7 @@ static void fini_ports(RdmaDeviceResources *dev_res,
 {
 int i;
 
-dev_res->ports[0].state = IBV_PORT_DOWN;
+dev_res->port.state = IBV_PORT_DOWN;
 for (i = 0; i < MAX_PORT_GIDS; i++) {
 rdma_rm_del_gid(dev_res, backend_dev, ifname, i);
 }
diff --git a/hw/rdma/rdma_rm.h b/hw/rdma/rdma_rm.h
index a7169b4e89..3c602c04c0 100644
--- a/hw/rdma/rdma_rm.h
+++ b/hw/rdma/rdma_rm.h
@@ -79,7 +79,7 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources 
*dev_res,
 static inline union ibv_gid *rdma_rm_get_gid(RdmaDeviceResources *dev_res,
  int sgid_idx)
 {
-return _res->ports[0].gid_tbl[sgid_idx].gid;
+return _res->port.gid_tbl[sgid_idx].gid;
 }
 
 #endif
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
index 7b3435f991..0ba61d1838 100644
--- a/hw/rdma/rdma_rm_defs.h
+++ b/hw/rdma/rdma_rm_defs.h
@@ -18,7 +18,7 @@
 
 #include "rdma_backend_defs.h"
 
-#define MAX_PORTS 1
+#define MAX_PORTS 1 /* Do not change - we support only one port */
 #define MAX_PORT_GIDS 255
 #define MAX_GIDS  MAX_PORT_GIDS
 #define MAX_PORT_PKEYS1
@@ -97,7 +97,7 @@ typedef struct RdmaRmPort {
 } RdmaRmPort;
 
 typedef struct RdmaDeviceResources {
-RdmaRmPort ports[MAX_PORTS];
+RdmaRmPort port;
 RdmaRmResTbl pd_tbl;
 RdmaRmResTbl mr_tbl;
 RdmaRmResTbl uc_tbl;
-- 
2.17.2




[Qemu-devel] [PATCH v3 08/23] hw/pvrdma: Set the correct opcode for recv completion

2018-11-12 Thread Yuval Shaia
The function pvrdma_post_cqe populates the CQE entry with the opcode from
the given completion element. For receive operations the value was not set.
Fix it by setting it to IBV_WC_RECV.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/vmw/pvrdma_qp_ops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index 762700a205..7b0f440fda 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -196,8 +196,9 @@ int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
 comp_ctx = g_malloc(sizeof(CompHandlerCtx));
 comp_ctx->dev = dev;
 comp_ctx->cq_handle = qp->recv_cq_handle;
-comp_ctx->cqe.qp = qp_handle;
 comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
+comp_ctx->cqe.qp = qp_handle;
+comp_ctx->cqe.opcode = IBV_WC_RECV;
 
 rdma_backend_post_recv(>backend_dev, >rdma_dev_res,
>backend_qp, qp->qp_type,
-- 
2.17.2




[Qemu-devel] [PATCH v3 01/23] contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer

2018-11-12 Thread Yuval Shaia
The RDMA MAD kernel module (ibcm) disallows more than one MAD agent for a
given MAD class.
This does not go hand-in-hand with the qemu pvrdma device's requirements,
where each VM is a MAD agent.
Fix it by adding implementation of RDMA MAD multiplexer service which on
one hand register as a sole MAD agent with the kernel module and on the
other hand gives service to more than one VM.

Design Overview:

A server process is registered to UMAD framework (for this to work the
rdma_cm kernel module needs to be unloaded) and creates a unix socket to
listen to incoming request from clients.
A client process (such as QEMU) connects to this unix socket and
registers with its own GID.

TX:
---
When a client needs to send an rdma_cm MAD message it constructs it the same
way as without this multiplexer, i.e. it creates a umad packet, but this
time it writes its content to the socket instead of calling umad_send().
The server, upon receiving such a message, fetches the local_comm_id from it
so a context for this session can be maintained, and relays the message to
the UMAD layer by calling umad_send().

RX:
---
The server creates a worker thread to process incoming rdma_cm MAD
messages. When an incoming message arrives (umad_recv()), the server,
depending on the message type (attr_id), looks for the target client by
searching either the gid->fd table or the local_comm_id->fd table. With
the extracted fd the server relays the incoming message to the client.

Signed-off-by: Yuval Shaia 
---
 MAINTAINERS  |   1 +
 Makefile |   3 +
 Makefile.objs|   1 +
 contrib/rdmacm-mux/Makefile.objs |   4 +
 contrib/rdmacm-mux/main.c| 771 +++
 contrib/rdmacm-mux/rdmacm-mux.h  |  56 +++
 6 files changed, 836 insertions(+)
 create mode 100644 contrib/rdmacm-mux/Makefile.objs
 create mode 100644 contrib/rdmacm-mux/main.c
 create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 98a1856afc..e087d58ac6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2231,6 +2231,7 @@ S: Maintained
 F: hw/rdma/*
 F: hw/rdma/vmw/*
 F: docs/pvrdma.txt
+F: contrib/rdmacm-mux/*
 
 Build and test automation
 -
diff --git a/Makefile b/Makefile
index f2947186a4..94072776ff 100644
--- a/Makefile
+++ b/Makefile
@@ -418,6 +418,7 @@ dummy := $(call unnest-vars,, \
 elf2dmp-obj-y \
 ivshmem-client-obj-y \
 ivshmem-server-obj-y \
+rdmacm-mux-obj-y \
 libvhost-user-obj-y \
 vhost-user-scsi-obj-y \
 vhost-user-blk-obj-y \
@@ -725,6 +726,8 @@ vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) 
libvhost-user.a
$(call LINK, $^)
 vhost-user-blk$(EXESUF): $(vhost-user-blk-obj-y) libvhost-user.a
$(call LINK, $^)
+rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS)
+   $(call LINK, $^)
 
 module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
$(call quiet-command,$(PYTHON) $< $@ \
diff --git a/Makefile.objs b/Makefile.objs
index 1e1ff387d7..cc7df3ad80 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -194,6 +194,7 @@ vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS)
 vhost-user-scsi.o-libs := $(LIBISCSI_LIBS)
 vhost-user-scsi-obj-y = contrib/vhost-user-scsi/
 vhost-user-blk-obj-y = contrib/vhost-user-blk/
+rdmacm-mux-obj-y = contrib/rdmacm-mux/
 
 ##
 trace-events-subdirs =
diff --git a/contrib/rdmacm-mux/Makefile.objs b/contrib/rdmacm-mux/Makefile.objs
new file mode 100644
index 00..be3eacb6f7
--- /dev/null
+++ b/contrib/rdmacm-mux/Makefile.objs
@@ -0,0 +1,4 @@
+ifdef CONFIG_PVRDMA
+CFLAGS += -libumad -Wno-format-truncation
+rdmacm-mux-obj-y = main.o
+endif
diff --git a/contrib/rdmacm-mux/main.c b/contrib/rdmacm-mux/main.c
new file mode 100644
index 00..47cf0ac7bc
--- /dev/null
+++ b/contrib/rdmacm-mux/main.c
@@ -0,0 +1,771 @@
+/*
+ * QEMU paravirtual RDMA - rdmacm-mux implementation
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ * Yuval Shaia 
+ * Marcel Apfelbaum 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "sys/poll.h"
+#include "sys/ioctl.h"
+#include "pthread.h"
+#include "syslog.h"
+
+#include "infiniband/verbs.h"
+#include "infiniband/umad.h"
+#include "infiniband/umad_types.h"
+#include "infiniband/umad_sa.h"
+#include "infiniband/umad_cm.h"
+
+#include "rdmacm-mux.h"
+
+#define SCALE_US 1000
+#define COMMID_TTL 2 /* How many SCALE_US a context of MAD session is saved */
+#define SLEEP_SECS 5 /* This is used both in poll() and thread */
+#define SERVER_LISTEN_BACKLOG 10
+#define MAX_CLIENTS 4096
+#define MAD_RMPP_VERSION 0
+#define MAD_METHOD_MASK0 0x8
+
+#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * 

[Qemu-devel] [PATCH v3 17/23] hw/pvrdma: Fill error code in command's response

2018-11-12 Thread Yuval Shaia
The driver checks the error code, so let's set it.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma_cmd.c | 67 
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index 0d3c818c20..a326c5d470 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -131,7 +131,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 if (rdma_backend_query_port(>backend_dev,
 (struct ibv_port_attr *))) {
-return -ENOMEM;
+resp->hdr.err = -ENOMEM;
+goto out;
 }
 
 memset(resp, 0, sizeof(*resp));
@@ -150,7 +151,9 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->attrs.active_width = 1;
 resp->attrs.active_speed = 1;
 
-return 0;
+out:
+pr_dbg("ret=%d\n", resp->hdr.err);
+return resp->hdr.err;
 }
 
 static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -170,7 +173,7 @@ static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->pkey = PVRDMA_PKEY;
 pr_dbg("pkey=0x%x\n", resp->pkey);
 
-return 0;
+return resp->hdr.err;
 }
 
 static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -200,7 +203,9 @@ static int destroy_pd(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_pd(>rdma_dev_res, cmd->pd_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+return rsp->hdr.err;
 }
 
 static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -251,7 +256,9 @@ static int destroy_mr(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_mr(>rdma_dev_res, cmd->mr_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+return rsp->hdr.err;
 }
 
 static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring,
@@ -353,7 +360,8 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 cq = rdma_rm_get_cq(>rdma_dev_res, cmd->cq_handle);
 if (!cq) {
 pr_dbg("Invalid CQ handle\n");
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 ring = (PvrdmaRing *)cq->opaque;
@@ -364,7 +372,11 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_cq(>rdma_dev_res, cmd->cq_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
@@ -553,7 +565,8 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 qp = rdma_rm_get_qp(>rdma_dev_res, cmd->qp_handle);
 if (!qp) {
 pr_dbg("Invalid QP handle\n");
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 rdma_rm_dealloc_qp(>rdma_dev_res, cmd->qp_handle);
@@ -567,7 +580,11 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, TARGET_PAGE_SIZE);
 g_free(ring);
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -580,7 +597,8 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 pr_dbg("index=%d\n", cmd->index);
 
 if (cmd->index >= MAX_PORT_GIDS) {
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
@@ -590,10 +608,15 @@ static int create_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 rc = rdma_rm_add_gid(>rdma_dev_res, >backend_dev,
  dev->backend_eth_device_name, gid, cmd->index);
 if (rc < 0) {
-return -EINVAL;
+rsp->hdr.err = rc;
+goto out;
 }
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -606,7 +629,8 @@ static int destroy_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 pr_dbg("index=%d\n", cmd->index);
 
 if (cmd->index >= MAX_PORT_GIDS) {
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 rc = rdma_rm_del_gid(>rdma_dev_res, >backend_dev,
@@ -617,7 +641,11 @@ static int destroy_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 goto out;
 }
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -634,9 +662,8 @@ static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->hdr.err = rdma_rm_alloc_uc(>rdma_dev_res, cmd->pfn,
  >ctx_handle);
 
-pr_dbg("ret=%d\n", resp->hdr.err);
-
-return 0;
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int destroy_uc(PVRDMADev *dev, union 

[Qemu-devel] [PATCH v3 04/23] hw/rdma: Abort send-op if fail to create addr handler

2018-11-12 Thread Yuval Shaia
Function create_ah might return NULL, let's exit with an error.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/rdma_backend.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index d7a4bbd91f..1e148398a2 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -338,6 +338,10 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (qp_type == IBV_QPT_UD) {
 wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd,
 backend_dev->backend_gid_idx, dgid);
+if (!wr.wr.ud.ah) {
+comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+goto out_dealloc_cqe_ctx;
+}
 wr.wr.ud.remote_qpn = dqpn;
 wr.wr.ud.remote_qkey = dqkey;
 }
-- 
2.17.2




[Qemu-devel] [PATCH v3 06/23] hw/pvrdma: Make function reset_device return void

2018-11-12 Thread Yuval Shaia
This function cannot fail - fix it to return void

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/vmw/pvrdma_main.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 6c8c0154fa..fc2abd34af 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -369,13 +369,11 @@ static int unquiesce_device(PVRDMADev *dev)
 return 0;
 }
 
-static int reset_device(PVRDMADev *dev)
+static void reset_device(PVRDMADev *dev)
 {
 pvrdma_stop(dev);
 
 pr_dbg("Device reset complete\n");
-
-return 0;
 }
 
 static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size)
-- 
2.17.2




[Qemu-devel] [PATCH v3 05/23] hw/rdma: Add support for MAD packets

2018-11-12 Thread Yuval Shaia
MAD (Management Datagram) packets are widely used by various modules
both in kernel and in user space; for example the rdma_* API, which is
used to create and maintain a "connection" layer on top of RDMA, uses
several types of MAD packets.
To support MAD packets the device uses an external utility
(contrib/rdmacm-mux) to relay packets from and to the guest driver.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 263 +++-
 hw/rdma/rdma_backend.h  |   4 +-
 hw/rdma/rdma_backend_defs.h |  10 +-
 hw/rdma/vmw/pvrdma.h|   2 +
 hw/rdma/vmw/pvrdma_main.c   |   4 +-
 5 files changed, 273 insertions(+), 10 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 1e148398a2..3eb0099f8d 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -16,8 +16,13 @@
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qnum.h"
 
 #include 
+#include 
+#include 
+#include 
 
 #include "trace.h"
 #include "rdma_utils.h"
@@ -33,16 +38,25 @@
 #define VENDOR_ERR_MAD_SEND 0x206
 #define VENDOR_ERR_INVLKEY  0x207
 #define VENDOR_ERR_MR_SMALL 0x208
+#define VENDOR_ERR_INV_MAD_BUFF 0x209
+#define VENDOR_ERR_INV_NUM_SGE  0x210
 
 #define THR_NAME_LEN 16
 #define THR_POLL_TO  5000
 
+#define MAD_HDR_SIZE sizeof(struct ibv_grh)
+
 typedef struct BackendCtx {
-uint64_t req_id;
 void *up_ctx;
 bool is_tx_req;
+struct ibv_sge sge; /* Used to save MAD recv buffer */
 } BackendCtx;
 
+struct backend_umad {
+struct ib_user_mad hdr;
+char mad[RDMA_MAX_PRIVATE_DATA];
+};
+
 static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
 
 static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
@@ -286,6 +300,49 @@ static int build_host_sge_array(RdmaDeviceResources 
*rdma_dev_res,
 return 0;
 }
 
+static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
+uint32_t num_sge)
+{
+struct backend_umad umad = {0};
+char *hdr, *msg;
+int ret;
+
+pr_dbg("num_sge=%d\n", num_sge);
+
+if (num_sge != 2) {
+return -EINVAL;
+}
+
+umad.hdr.length = sge[0].length + sge[1].length;
+pr_dbg("msg_len=%d\n", umad.hdr.length);
+
+if (umad.hdr.length > sizeof(umad.mad)) {
+return -ENOMEM;
+}
+
+umad.hdr.addr.qpn = htobe32(1);
+umad.hdr.addr.grh_present = 1;
+umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
+memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
+umad.hdr.addr.hop_limit = 1;
+
+hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
+msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+
+memcpy([0], hdr, sge[0].length);
+memcpy([sge[0].length], msg, sge[1].length);
+
+rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
+rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
+
+ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *),
+sizeof(umad));
+
+pr_dbg("qemu_chr_fe_write=%d\n", ret);
+
+return (ret != sizeof(umad));
+}
+
 void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 RdmaBackendQP *qp, uint8_t qp_type,
 struct ibv_sge *sge, uint32_t num_sge,
@@ -304,9 +361,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
 } else if (qp_type == IBV_QPT_GSI) {
 pr_dbg("QP1\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+rc = mad_send(backend_dev, sge, num_sge);
+if (rc) {
+comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+} else {
+comp_handler(IBV_WC_SUCCESS, 0, ctx);
+}
 }
-pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
 return;
 }
 
@@ -370,6 +431,48 @@ out_free_bctx:
 g_free(bctx);
 }
 
+static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
+ struct ibv_sge *sge, uint32_t num_sge,
+ void *ctx)
+{
+BackendCtx *bctx;
+int rc;
+uint32_t bctx_id;
+
+if (num_sge != 1) {
+pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
+return VENDOR_ERR_INV_NUM_SGE;
+}
+
+if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
+pr_dbg("Too small buffer for MAD\n");
+return VENDOR_ERR_INV_MAD_BUFF;
+}
+
+pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr);
+pr_dbg("length=%d\n", sge[0].length);
+pr_dbg("lkey=%d\n", sge[0].lkey);
+
+bctx = g_malloc0(sizeof(*bctx));
+
+rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, _id, bctx);
+if (unlikely(rc)) {
+

[Qemu-devel] [PATCH v3 00/23] Add support for RDMA MAD

2018-11-12 Thread Yuval Shaia
Hi all.

This is a major enhancement to the pvrdma device to allow it to work with
state of the art applications such as MPI.

As described in patch #5, MAD packets are management packets that are used
for many purposes including but not limited to communication layer above IB
verbs API.

Patch 1 exposes a new external executable (under contrib) that aims to
address a specific limitation in the RDMA userspace MAD stack.

This patch-set mainly presents the MAD enhancement, but during the work on
it I came across some bugs and enhancements that needed to be implemented
before doing any MAD coding. This is the role of patches 2 to 4, 7 to 9 and
15 to 17.

Patches 6 and 18 are cosmetic changes; while not strictly relevant to this
patchset, they are still introduced with it since (at least for patch 6)
they are hard to decouple.

Patches 12 to 15 couple pvrdma device with vmxnet3 device as this is the
configuration enforced by pvrdma driver in guest - a vmxnet3 device in
function 0 and pvrdma device in function 1 in the same PCI slot. Patch 12
moves the needed code from the vmxnet3 device to a new header file that can
be used by pvrdma code, while patches 13 to 15 make use of it.

Along with this patch-set there is a parallel patch posted to libvirt to
apply the change needed there as part of the process implemented in patches
10 and 11. This change is needed so that guest would be able to configure
any IP to the Ethernet function of the pvrdma device.
https://www.redhat.com/archives/libvir-list/2018-November/msg00135.html

Since we maintain external resources such as GIDs on host GID table we need
to do some cleanup before going down. This is the job of patches 19 and 20.
Patches 20 and 21 contain fixes for bugs detected during the work on the
cleanup code processed during shutdown.

v1 -> v2:
* Fix compilation issue detected when compiling for mingw
* Address comment from Eric Blake re version of QEMU in json
  message
* Fix example from QMP message in json file
* Fix case where a VM tries to remove an invalid GID from GID table
* rdmacm-mux: Cleanup entries in socket-gids table when socket is
  closed
* Cleanup resources (GIDs, QPs etc) when VM goes down

v2 -> v3:
* Address comment from Cornelia Huck for patch #19
* Add some R-Bs from Marcel Apfelbaum and Dmitry Fleytman
* Update docs/pvrdma.txt with the changes made by this patchset
* Address comments from Shamir Rabinovitch for UMAD multiplexer

Thanks,
Yuval

Yuval Shaia (23):
  contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer
  hw/rdma: Add ability to force notification without re-arm
  hw/rdma: Return qpn 1 if ibqp is NULL
  hw/rdma: Abort send-op if fail to create addr handler
  hw/rdma: Add support for MAD packets
  hw/pvrdma: Make function reset_device return void
  hw/pvrdma: Make default pkey 0x
  hw/pvrdma: Set the correct opcode for recv completion
  hw/pvrdma: Set the correct opcode for send completion
  json: Define new QMP message for pvrdma
  hw/pvrdma: Add support to allow guest to configure GID table
  vmxnet3: Move some definitions to header file
  hw/pvrdma: Make sure PCI function 0 is vmxnet3
  hw/rdma: Initialize node_guid from vmxnet3 mac address
  hw/pvrdma: Make device state depend on Ethernet function state
  hw/pvrdma: Fill all CQE fields
  hw/pvrdma: Fill error code in command's response
  hw/rdma: Remove unneeded code that handles more that one port
  vl: Introduce shutdown_notifiers
  hw/pvrdma: Clean device's resource when system is shutdown
  hw/rdma: Do not use bitmap_zero_extend to free bitmap
  hw/rdma: Do not call rdma_backend_del_gid on an empty gid
  docs: Update pvrdma device documentation

 MAINTAINERS  |   2 +
 Makefile |   6 +-
 Makefile.objs|   5 +
 contrib/rdmacm-mux/Makefile.objs |   4 +
 contrib/rdmacm-mux/main.c| 771 +++
 contrib/rdmacm-mux/rdmacm-mux.h  |  56 +++
 docs/pvrdma.txt  | 103 -
 hw/net/vmxnet3.c | 116 +
 hw/net/vmxnet3_defs.h| 133 ++
 hw/rdma/rdma_backend.c   | 461 +++---
 hw/rdma/rdma_backend.h   |  28 +-
 hw/rdma/rdma_backend_defs.h  |  13 +-
 hw/rdma/rdma_rm.c| 120 -
 hw/rdma/rdma_rm.h|  17 +-
 hw/rdma/rdma_rm_defs.h   |  21 +-
 hw/rdma/rdma_utils.h |  24 +
 hw/rdma/vmw/pvrdma.h |  10 +-
 hw/rdma/vmw/pvrdma_cmd.c | 119 +++--
 hw/rdma/vmw/pvrdma_main.c|  49 +-
 hw/rdma/vmw/pvrdma_qp_ops.c  |  62 ++-
 include/sysemu/sysemu.h  |   1 +
 qapi/qapi-schema.json|   1 +
 qapi/rdma.json   |  38 ++
 vl.c |  15 +-
 24 files changed, 1868 insertions(+), 307 deletions(-)
 create mode 100644 contrib/rdmacm-mux/Makefile.objs
 create mode 100644 contrib/rdmacm-mux/main.c
 create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h
 create mode 100644 hw/net/vmxnet3_defs.h
 create 

[Qemu-devel] [PATCH v3 16/23] hw/pvrdma: Fill all CQE fields

2018-11-12 Thread Yuval Shaia
Add ability to pass specific WC attributes to CQE such as GRH_BIT flag.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 59 +++--
 hw/rdma/rdma_backend.h  |  4 +--
 hw/rdma/vmw/pvrdma_qp_ops.c | 31 +++
 3 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 5675504165..e453bda8f9 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -59,13 +59,24 @@ struct backend_umad {
 char mad[RDMA_MAX_PRIVATE_DATA];
 };
 
-static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
+static void (*comp_handler)(void *ctx, struct ibv_wc *wc);
 
-static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
+static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
 {
 pr_err("No completion handler is registered\n");
 }
 
+static inline void complete_work(enum ibv_wc_status status, uint32_t 
vendor_err,
+ void *ctx)
+{
+struct ibv_wc wc = {0};
+
+wc.status = status;
+wc.vendor_err = vendor_err;
+
+comp_handler(ctx, );
+}
+
 static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
 {
 int i, ne;
@@ -90,7 +101,7 @@ static void poll_cq(RdmaDeviceResources *rdma_dev_res, 
struct ibv_cq *ibcq)
 }
 pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv");
 
-comp_handler(wc[i].status, wc[i].vendor_err, bctx->up_ctx);
+comp_handler(bctx->up_ctx, [i]);
 
 rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
 g_free(bctx);
@@ -184,8 +195,8 @@ static void start_comp_thread(RdmaBackendDev *backend_dev)
comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
 }
 
-void rdma_backend_register_comp_handler(void (*handler)(int status,
-unsigned int vendor_err, void *ctx))
+void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
+ struct ibv_wc *wc))
 {
 comp_handler = handler;
 }
@@ -369,14 +380,14 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
 if (qp_type == IBV_QPT_SMI) {
 pr_dbg("QP0 unsupported\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
 } else if (qp_type == IBV_QPT_GSI) {
 pr_dbg("QP1\n");
 rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
 if (rc) {
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
 } else {
-comp_handler(IBV_WC_SUCCESS, 0, ctx);
+complete_work(IBV_WC_SUCCESS, 0, ctx);
 }
 }
 return;
@@ -385,7 +396,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 pr_dbg("num_sge=%d\n", num_sge);
 if (!num_sge) {
 pr_dbg("num_sge=0\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
 return;
 }
 
@@ -396,21 +407,21 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, _id, bctx);
 if (unlikely(rc)) {
 pr_dbg("Failed to allocate cqe_ctx\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
 goto out_free_bctx;
 }
 
 rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, 
num_sge);
 if (rc) {
 pr_dbg("Error: Failed to build host SGE array\n");
-comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
+complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
 goto out_dealloc_cqe_ctx;
 }
 
 if (qp_type == IBV_QPT_UD) {
 wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
 if (!wr.wr.ud.ah) {
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
 goto out_dealloc_cqe_ctx;
 }
 wr.wr.ud.remote_qpn = dqpn;
@@ -428,7 +439,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (rc) {
 pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno,
 qp->ibqp->qp_num);
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
 goto out_dealloc_cqe_ctx;
 }
 
@@ -497,13 +508,13 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
 if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
   

[Qemu-devel] [PATCH v3 02/23] hw/rdma: Add ability to force notification without re-arm

2018-11-12 Thread Yuval Shaia
Upon completion of an incoming packet the device pushes a CQE to the
driver's RX ring and notifies the driver (MSI-X).
While for data-path incoming packets the driver needs the ability to
control whether it wishes to receive interrupts or not, for control-path
packets such as incoming MADs the driver needs to be notified anyway; it
does not even need to re-arm the notification bit.

Enhance the notification field to support this.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/rdma_rm.c   | 12 ++--
 hw/rdma/rdma_rm_defs.h  |  8 +++-
 hw/rdma/vmw/pvrdma_qp_ops.c |  6 --
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 8d59a42cd1..4f10fcabcc 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -263,7 +263,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 }
 
 cq->opaque = opaque;
-cq->notify = false;
+cq->notify = CNT_CLEAR;
 
 rc = rdma_backend_create_cq(backend_dev, >backend_cq, cqe);
 if (rc) {
@@ -291,7 +291,10 @@ void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, 
uint32_t cq_handle,
 return;
 }
 
-cq->notify = notify;
+if (cq->notify != CNT_SET) {
+cq->notify = notify ? CNT_ARM : CNT_CLEAR;
+}
+
 pr_dbg("notify=%d\n", cq->notify);
 }
 
@@ -349,6 +352,11 @@ int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, 
uint32_t pd_handle,
 return -EINVAL;
 }
 
+if (qp_type == IBV_QPT_GSI) {
+scq->notify = CNT_SET;
+rcq->notify = CNT_SET;
+}
+
 qp = res_tbl_alloc(_res->qp_tbl, _qpn);
 if (!qp) {
 return -ENOMEM;
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
index 7228151239..9b399063d3 100644
--- a/hw/rdma/rdma_rm_defs.h
+++ b/hw/rdma/rdma_rm_defs.h
@@ -49,10 +49,16 @@ typedef struct RdmaRmPD {
 uint32_t ctx_handle;
 } RdmaRmPD;
 
+typedef enum CQNotificationType {
+CNT_CLEAR,
+CNT_ARM,
+CNT_SET,
+} CQNotificationType;
+
 typedef struct RdmaRmCQ {
 RdmaBackendCQ backend_cq;
 void *opaque;
-bool notify;
+CQNotificationType notify;
 } RdmaRmCQ;
 
 /* MR (DMA region) */
diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index c668afd0ed..762700a205 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -89,8 +89,10 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t 
cq_handle,
 pvrdma_ring_write_inc(>dsr_info.cq);
 
 pr_dbg("cq->notify=%d\n", cq->notify);
-if (cq->notify) {
-cq->notify = false;
+if (cq->notify != CNT_CLEAR) {
+if (cq->notify == CNT_ARM) {
+cq->notify = CNT_CLEAR;
+}
 post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q);
 }
 
-- 
2.17.2




[Qemu-devel] [PATCH v3 03/23] hw/rdma: Return qpn 1 if ibqp is NULL

2018-11-12 Thread Yuval Shaia
The device does not support QP0, only QP1.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
index 86e8fe8ab6..3ccc9a2494 100644
--- a/hw/rdma/rdma_backend.h
+++ b/hw/rdma/rdma_backend.h
@@ -33,7 +33,7 @@ static inline union ibv_gid *rdma_backend_gid(RdmaBackendDev 
*dev)
 
 static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp)
 {
-return qp->ibqp ? qp->ibqp->qp_num : 0;
+return qp->ibqp ? qp->ibqp->qp_num : 1;
 }
 
 static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr)
-- 
2.17.2




[Qemu-devel] [PATCH v3 10/23] json: Define new QMP message for pvrdma

2018-11-12 Thread Yuval Shaia
pvrdma requires that the same GID attached to it will be attached to the
backend device in the host.

A new QMP message is defined so the pvrdma device can broadcast any change
made to its GID table. This event is captured by libvirt which in turn
will update the GID table in the backend device.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 MAINTAINERS   |  1 +
 Makefile  |  3 ++-
 Makefile.objs |  4 
 qapi/qapi-schema.json |  1 +
 qapi/rdma.json| 38 ++
 5 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 qapi/rdma.json

diff --git a/MAINTAINERS b/MAINTAINERS
index e087d58ac6..a149f68a8f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2232,6 +2232,7 @@ F: hw/rdma/*
 F: hw/rdma/vmw/*
 F: docs/pvrdma.txt
 F: contrib/rdmacm-mux/*
+F: qapi/rdma.json
 
 Build and test automation
 -
diff --git a/Makefile b/Makefile
index 94072776ff..db4ce60ee5 100644
--- a/Makefile
+++ b/Makefile
@@ -599,7 +599,8 @@ qapi-modules = $(SRC_PATH)/qapi/qapi-schema.json 
$(SRC_PATH)/qapi/common.json \
$(SRC_PATH)/qapi/tpm.json \
$(SRC_PATH)/qapi/trace.json \
$(SRC_PATH)/qapi/transaction.json \
-   $(SRC_PATH)/qapi/ui.json
+   $(SRC_PATH)/qapi/ui.json \
+   $(SRC_PATH)/qapi/rdma.json
 
 qapi/qapi-builtin-types.c qapi/qapi-builtin-types.h \
 qapi/qapi-types.c qapi/qapi-types.h \
diff --git a/Makefile.objs b/Makefile.objs
index cc7df3ad80..76d8028f2f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -21,6 +21,7 @@ util-obj-y += qapi/qapi-types-tpm.o
 util-obj-y += qapi/qapi-types-trace.o
 util-obj-y += qapi/qapi-types-transaction.o
 util-obj-y += qapi/qapi-types-ui.o
+util-obj-y += qapi/qapi-types-rdma.o
 util-obj-y += qapi/qapi-builtin-visit.o
 util-obj-y += qapi/qapi-visit.o
 util-obj-y += qapi/qapi-visit-block-core.o
@@ -40,6 +41,7 @@ util-obj-y += qapi/qapi-visit-tpm.o
 util-obj-y += qapi/qapi-visit-trace.o
 util-obj-y += qapi/qapi-visit-transaction.o
 util-obj-y += qapi/qapi-visit-ui.o
+util-obj-y += qapi/qapi-visit-rdma.o
 util-obj-y += qapi/qapi-events.o
 util-obj-y += qapi/qapi-events-block-core.o
 util-obj-y += qapi/qapi-events-block.o
@@ -58,6 +60,7 @@ util-obj-y += qapi/qapi-events-tpm.o
 util-obj-y += qapi/qapi-events-trace.o
 util-obj-y += qapi/qapi-events-transaction.o
 util-obj-y += qapi/qapi-events-ui.o
+util-obj-y += qapi/qapi-events-rdma.o
 util-obj-y += qapi/qapi-introspect.o
 
 chardev-obj-y = chardev/
@@ -155,6 +158,7 @@ common-obj-y += qapi/qapi-commands-tpm.o
 common-obj-y += qapi/qapi-commands-trace.o
 common-obj-y += qapi/qapi-commands-transaction.o
 common-obj-y += qapi/qapi-commands-ui.o
+common-obj-y += qapi/qapi-commands-rdma.o
 common-obj-y += qapi/qapi-introspect.o
 common-obj-y += qmp.o hmp.o
 endif
diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json
index 65b6dc2f6f..a650d80f83 100644
--- a/qapi/qapi-schema.json
+++ b/qapi/qapi-schema.json
@@ -94,3 +94,4 @@
 { 'include': 'trace.json' }
 { 'include': 'introspect.json' }
 { 'include': 'misc.json' }
+{ 'include': 'rdma.json' }
diff --git a/qapi/rdma.json b/qapi/rdma.json
new file mode 100644
index 00..804c68ab36
--- /dev/null
+++ b/qapi/rdma.json
@@ -0,0 +1,38 @@
+# -*- Mode: Python -*-
+#
+
+##
+# = RDMA device
+##
+
+##
+# @RDMA_GID_STATUS_CHANGED:
+#
+# Emitted when guest driver adds/deletes GID to/from device
+#
+# @netdev: RoCE Network Device name - char *
+#
+# @gid-status: Add or delete indication - bool
+#
+# @subnet-prefix: Subnet Prefix - uint64
+#
+# @interface-id : Interface ID - uint64
+#
+# Since: 3.2
+#
+# Example:
+#
+# <- {"timestamp": {"seconds": 1541579657, "microseconds": 986760},
+# "event": "RDMA_GID_STATUS_CHANGED",
+# "data":
+# {"netdev": "bridge0",
+# "interface-id": 15880512517475447892,
+# "gid-status": true,
+# "subnet-prefix": 33022}}
+#
+##
+{ 'event': 'RDMA_GID_STATUS_CHANGED',
+  'data': { 'netdev': 'str',
+'gid-status': 'bool',
+'subnet-prefix' : 'uint64',
+'interface-id'  : 'uint64' } }
-- 
2.17.2




[Qemu-devel] [PATCH v3 22/23] hw/rdma: Do not call rdma_backend_del_gid on an empty gid

2018-11-12 Thread Yuval Shaia
When the device goes down, the function fini_ports loops over all entries
in the gid table regardless of whether an entry is valid or not. In case
an entry is not valid we'd like to skip any further processing in the
backend device.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_rm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 35a96d9a64..e3f6b2f6ea 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -555,6 +555,10 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 {
 int rc;
 
+if (!dev_res->port.gid_tbl[gid_idx].gid.global.interface_id) {
+return 0;
+}
+
 rc = rdma_backend_del_gid(backend_dev, ifname,
   _res->port.gid_tbl[gid_idx].gid);
 if (rc < 0) {
-- 
2.17.2




[Qemu-devel] [PATCH v3 23/23] docs: Update pvrdma device documentation

2018-11-12 Thread Yuval Shaia
Interface with the device is changed with the addition of support for
MAD packets.
Adjust documentation accordingly.

While there, fix a minor mistake which may lead one to think that there is
a relation between using RXE on the host and the compatibility with
bare-metal peers.

Signed-off-by: Yuval Shaia 
---
 docs/pvrdma.txt | 103 +++-
 1 file changed, 84 insertions(+), 19 deletions(-)

diff --git a/docs/pvrdma.txt b/docs/pvrdma.txt
index 5599318159..9e8d1674b7 100644
--- a/docs/pvrdma.txt
+++ b/docs/pvrdma.txt
@@ -9,8 +9,9 @@ It works with its Linux Kernel driver AS IS, no need for any 
special guest
 modifications.
 
 While it complies with the VMware device, it can also communicate with bare
-metal RDMA-enabled machines and does not require an RDMA HCA in the host, it
-can work with Soft-RoCE (rxe).
+metal RDMA-enabled machines as peers.
+
+It does not require an RDMA HCA in the host, it can work with Soft-RoCE (rxe).
 
 It does not require the whole guest RAM to be pinned allowing memory
 over-commit and, even if not implemented yet, migration support will be
@@ -78,29 +79,93 @@ the required RDMA libraries.
 
 3. Usage
 
+
+
+3.1 VM Memory settings
+==
 Currently the device is working only with memory backed RAM
 and it must be mark as "shared":
-m 1G \
-object memory-backend-ram,id=mb1,size=1G,share \
-numa node,memdev=mb1 \
 
-The pvrdma device is composed of two functions:
- - Function 0 is a vmxnet Ethernet Device which is redundant in Guest
-   but is required to pass the ibdevice GID using its MAC.
-   Examples:
- For an rxe backend using eth0 interface it will use its mac:
-   -device vmxnet3,addr=.0,multifunction=on,mac=
- For an SRIOV VF, we take the Ethernet Interface exposed by it:
-   -device vmxnet3,multifunction=on,mac=
- - Function 1 is the actual device:
-   -device 
pvrdma,addr=.1,backend-dev=,backend-gid-idx=,backend-port=
-   where the ibdevice can be rxe or RDMA VF (e.g. mlx5_4)
- Note: Pay special attention that the GID at backend-gid-idx matches vmxnet's 
MAC.
- The rules of conversion are part of the RoCE spec, but since manual conversion
- is not required, spotting problems is not hard:
-Example: GID: fe80::::7efe:90ff:fecb:743a
- MAC: 7c:fe:90:cb:74:3a
-Note the difference between the first byte of the MAC and the GID.
+
+3.2 MAD Multiplexer
+===
+MAD Multiplexer is a service that exposes MAD-like interface for VMs in
+order to overcome the limitation where only single entity can register with
+MAD layer to send and receive RDMA-CM MAD packets.
+
+To build rdmacm-mux run
+# make rdmacm-mux
+
+The program accepts 3 command line arguments and exposes a UNIX socket to
+be used to relay control and data messages to and from the service.
+-s unix-socket-path   Path to unix socket to listen on
+  (default /var/run/rdmacm-mux)
+-d rdma-device-name   Name of RDMA device to register with
+  (default rxe0)
+-p rdma-device-port   Port number of RDMA device to register with
+  (default 1)
+The final UNIX socket file name is a concatenation of the 3 arguments so
+for example for device name mlx5_0 and port 2 the file
+/var/run/rdmacm-mux-mlx5_0-2 will be created.
+
+Please refer to contrib/rdmacm-mux for more details.
+
+
+3.3 PCI devices settings
+
+RoCE device exposes two functions - Ethernet and RDMA.
+To support it, pvrdma device is composed of two PCI functions, an Ethernet
+device of type vmxnet3 on PCI slot 0 and a pvrdma device on PCI slot 1. The
+Ethernet function can be used for other Ethernet purposes such as IP.
+
+
+3.4 Device parameters
+=
+- netdev: Specifies the Ethernet device on host. For Soft-RoCE (rxe) this
+  would be the Ethernet device used to create it. For any other physical
+  RoCE device this would be the netdev name of the device.
+- ibdev: The IB device name on host for example rxe0, mlx5_0 etc.
+- mad-chardev: The name of the MAD multiplexer char device.
+- ibport: In case of multi-port device (such as Mellanox's HCA) this
+  specify the port to use. If not set 1 will be used.
+- dev-caps-max-mr-size: The maximum size of MR.
+- dev-caps-max-qp: Maximum number of QPs.
+- dev-caps-max-sge: Maximum number of SGE elements in WR.
+- dev-caps-max-cq: Maximum number of CQs.
+- dev-caps-max-mr: Maximum number of MRs.
+- dev-caps-max-pd: Maximum number of PDs.
+- dev-caps-max-ah: Maximum number of AHs.
+
+Notes:
+- The first 3 parameters are mandatory settings, the rest have their
+  defaults.
+- The last 8 parameters (the ones that prefixed by dev-caps) defines the top
+  limits but the final values are adjusted by the backend device limitations.
+
+3.5 Example
+===
+Define bridge device with vmxnet3 network backend:
+
+  
+  
+  
+  
+
+
+Define pvrdma device:
+
+  
+  
+  
+  
+  
+  
+  
+  
+
 
 
 
-- 
2.17.2




[Qemu-devel] [PATCH v3 09/23] hw/pvrdma: Set the correct opcode for send completion

2018-11-12 Thread Yuval Shaia
The opcode for a WC should be set by the device and not taken from the
work element.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma_qp_ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index 7b0f440fda..3388be1926 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -154,7 +154,7 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
 comp_ctx->cq_handle = qp->send_cq_handle;
 comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
 comp_ctx->cqe.qp = qp_handle;
-comp_ctx->cqe.opcode = wqe->hdr.opcode;
+comp_ctx->cqe.opcode = IBV_WC_SEND;
 
 rdma_backend_post_send(>backend_dev, >backend_qp, qp->qp_type,
(struct ibv_sge *)>sge[0], 
wqe->hdr.num_sge,
-- 
2.17.2




[Qemu-devel] [PATCH v3 20/23] hw/pvrdma: Clean device's resource when system is shutdown

2018-11-12 Thread Yuval Shaia
In order to clean up some external resources such as GIDs, QPs etc,
register to receive a notification when the VM is shut down.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma.h  |  2 ++
 hw/rdma/vmw/pvrdma_main.c | 12 
 2 files changed, 14 insertions(+)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index 10a3c4fb7c..ffae36986e 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -17,6 +17,7 @@
 #define PVRDMA_PVRDMA_H
 
 #include "qemu/units.h"
+#include "qemu/notify.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/msix.h"
 #include "chardev/char-fe.h"
@@ -87,6 +88,7 @@ typedef struct PVRDMADev {
 RdmaDeviceResources rdma_dev_res;
 CharBackend mad_chr;
 VMXNET3State *func0;
+Notifier shutdown_notifier;
 } PVRDMADev;
 #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 95e9322b7c..45a59cddf9 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -24,6 +24,7 @@
 #include "hw/qdev-properties.h"
 #include "cpu.h"
 #include "trace.h"
+#include "sysemu/sysemu.h"
 
 #include "../rdma_rm.h"
 #include "../rdma_backend.h"
@@ -559,6 +560,14 @@ static int pvrdma_check_ram_shared(Object *obj, void 
*opaque)
 return 0;
 }
 
+static void pvrdma_shutdown_notifier(Notifier *n, void *opaque)
+{
+PVRDMADev *dev = container_of(n, PVRDMADev, shutdown_notifier);
+PCIDevice *pci_dev = PCI_DEVICE(dev);
+
+pvrdma_fini(pci_dev);
+}
+
 static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 {
 int rc;
@@ -623,6 +632,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 goto out;
 }
 
+dev->shutdown_notifier.notify = pvrdma_shutdown_notifier;
+qemu_register_shutdown_notifier(>shutdown_notifier);
+
 out:
 if (rc) {
 error_append_hint(errp, "Device fail to load\n");
-- 
2.17.2




[Qemu-devel] [PATCH v3 21/23] hw/rdma: Do not use bitmap_zero_extend to free bitmap

2018-11-12 Thread Yuval Shaia
bitmap_zero_extend is designed to work for extending, not for
shrinking.
Use g_free instead.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_rm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 0a5ab8935a..35a96d9a64 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -43,7 +43,7 @@ static inline void res_tbl_free(RdmaRmResTbl *tbl)
 {
 qemu_mutex_destroy(>lock);
 g_free(tbl->tbl);
-bitmap_zero_extend(tbl->bitmap, tbl->tbl_sz, 0);
+g_free(tbl->bitmap);
 }
 
 static inline void *res_tbl_get(RdmaRmResTbl *tbl, uint32_t handle)
-- 
2.17.2




[Qemu-devel] [PATCH v3 13/23] hw/pvrdma: Make sure PCI function 0 is vmxnet3

2018-11-12 Thread Yuval Shaia
The guest driver enforces it; we should do so as well.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma.h  | 2 ++
 hw/rdma/vmw/pvrdma_main.c | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index b019cb843a..10a3c4fb7c 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -20,6 +20,7 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/msix.h"
 #include "chardev/char-fe.h"
+#include "hw/net/vmxnet3_defs.h"
 
 #include "../rdma_backend_defs.h"
 #include "../rdma_rm_defs.h"
@@ -85,6 +86,7 @@ typedef struct PVRDMADev {
 RdmaBackendDev backend_dev;
 RdmaDeviceResources rdma_dev_res;
 CharBackend mad_chr;
+VMXNET3State *func0;
 } PVRDMADev;
 #define PVRDMA_DEV(dev) OBJECT_CHECK(PVRDMADev, (dev), PVRDMA_HW_NAME)
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index ac8c092db0..fa6468d221 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -576,6 +576,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 return;
 }
 
+/* Break if not vmxnet3 device in slot 0 */
+dev->func0 = VMXNET3(pci_get_function_0(pdev));
+
 memdev_root = object_resolve_path("/objects", NULL);
 if (memdev_root) {
 object_child_foreach(memdev_root, pvrdma_check_ram_shared, 
_shared);
-- 
2.17.2




[Qemu-devel] [PATCH v3 07/23] hw/pvrdma: Make default pkey 0xFFFF

2018-11-12 Thread Yuval Shaia
Commit 6e7dba23af ("hw/pvrdma: Make default pkey 0xFFFF") exports the
default pkey as an external definition but omits the change from 0x7FFF to
0xFFFF.

Fixes: 6e7dba23af ("hw/pvrdma: Make default pkey 0xFFFF")

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/vmw/pvrdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index e3742d893a..15c3f28b86 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -52,7 +52,7 @@
 #define PVRDMA_FW_VERSION14
 
 /* Some defaults */
-#define PVRDMA_PKEY  0x7FFF
+#define PVRDMA_PKEY  0x
 
 typedef struct DSRInfo {
 dma_addr_t dma;
-- 
2.17.2




[Qemu-devel] [PATCH v3 19/23] vl: Introduce shutdown_notifiers

2018-11-12 Thread Yuval Shaia
The notifier will be used to signal a shutdown event, informing listeners
that the system is shutting down. This will allow devices and other
components to run any cleanup code needed before the VM shuts down.

Signed-off-by: Yuval Shaia 
---
 include/sysemu/sysemu.h |  1 +
 vl.c| 15 ++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 8d6095d98b..0d15f16492 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -80,6 +80,7 @@ void qemu_register_wakeup_notifier(Notifier *notifier);
 void qemu_system_shutdown_request(ShutdownCause reason);
 void qemu_system_powerdown_request(void);
 void qemu_register_powerdown_notifier(Notifier *notifier);
+void qemu_register_shutdown_notifier(Notifier *notifier);
 void qemu_system_debug_request(void);
 void qemu_system_vmstop_request(RunState reason);
 void qemu_system_vmstop_request_prepare(void);
diff --git a/vl.c b/vl.c
index 1fcacc5caa..d33d52522c 100644
--- a/vl.c
+++ b/vl.c
@@ -1578,6 +1578,8 @@ static NotifierList suspend_notifiers =
 NOTIFIER_LIST_INITIALIZER(suspend_notifiers);
 static NotifierList wakeup_notifiers =
 NOTIFIER_LIST_INITIALIZER(wakeup_notifiers);
+static NotifierList shutdown_notifiers =
+NOTIFIER_LIST_INITIALIZER(shutdown_notifiers);
 static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE);
 
 ShutdownCause qemu_shutdown_requested_get(void)
@@ -1809,6 +1811,12 @@ static void qemu_system_powerdown(void)
 notifier_list_notify(_notifiers, NULL);
 }
 
+static void qemu_system_shutdown(ShutdownCause cause)
+{
+qapi_event_send_shutdown(shutdown_caused_by_guest(cause));
+notifier_list_notify(_notifiers, );
+}
+
 void qemu_system_powerdown_request(void)
 {
 trace_qemu_system_powerdown_request();
@@ -1821,6 +1829,11 @@ void qemu_register_powerdown_notifier(Notifier *notifier)
 notifier_list_add(_notifiers, notifier);
 }
 
+void qemu_register_shutdown_notifier(Notifier *notifier)
+{
+notifier_list_add(_notifiers, notifier);
+}
+
 void qemu_system_debug_request(void)
 {
 debug_requested = 1;
@@ -1848,7 +1861,7 @@ static bool main_loop_should_exit(void)
 request = qemu_shutdown_requested();
 if (request) {
 qemu_kill_report();
-qapi_event_send_shutdown(shutdown_caused_by_guest(request));
+qemu_system_shutdown(request);
 if (no_shutdown) {
 vm_stop(RUN_STATE_SHUTDOWN);
 } else {
-- 
2.17.2




[Qemu-devel] [PATCH v3 15/23] hw/pvrdma: Make device state depend on Ethernet function state

2018-11-12 Thread Yuval Shaia
The user should be able to control the device by changing the Ethernet
function state, so if the user runs 'ifconfig ens3 down' the PVRDMA
function should be down as well.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma_cmd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index 2979582fac..0d3c818c20 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -139,7 +139,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->hdr.ack = PVRDMA_CMD_QUERY_PORT_RESP;
 resp->hdr.err = 0;
 
-resp->attrs.state = attrs.state;
+resp->attrs.state = dev->func0->device_active ? attrs.state :
+PVRDMA_PORT_DOWN;
 resp->attrs.max_mtu = attrs.max_mtu;
 resp->attrs.active_mtu = attrs.active_mtu;
 resp->attrs.phys_state = attrs.phys_state;
-- 
2.17.2




[Qemu-devel] [PATCH v3 18/23] hw/rdma: Remove unneeded code that handles more that one port

2018-11-12 Thread Yuval Shaia
The device supports only one port, so let's remove dead code that handles
more than one port.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_rm.c  | 34 --
 hw/rdma/rdma_rm.h  |  2 +-
 hw/rdma/rdma_rm_defs.h |  4 ++--
 3 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index fe0979415d..0a5ab8935a 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -545,7 +545,7 @@ int rdma_rm_add_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 return -EINVAL;
 }
 
-memcpy(_res->ports[0].gid_tbl[gid_idx].gid, gid, sizeof(*gid));
+memcpy(_res->port.gid_tbl[gid_idx].gid, gid, sizeof(*gid));
 
 return 0;
 }
@@ -556,15 +556,15 @@ int rdma_rm_del_gid(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 int rc;
 
 rc = rdma_backend_del_gid(backend_dev, ifname,
-  _res->ports[0].gid_tbl[gid_idx].gid);
+  _res->port.gid_tbl[gid_idx].gid);
 if (rc < 0) {
 pr_dbg("Fail to delete gid\n");
 return -EINVAL;
 }
 
-memset(dev_res->ports[0].gid_tbl[gid_idx].gid.raw, 0,
-   sizeof(dev_res->ports[0].gid_tbl[gid_idx].gid));
-dev_res->ports[0].gid_tbl[gid_idx].backend_gid_index = -1;
+memset(dev_res->port.gid_tbl[gid_idx].gid.raw, 0,
+   sizeof(dev_res->port.gid_tbl[gid_idx].gid));
+dev_res->port.gid_tbl[gid_idx].backend_gid_index = -1;
 
 return 0;
 }
@@ -577,16 +577,16 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources 
*dev_res,
 return -EINVAL;
 }
 
-if (unlikely(dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index == -1)) 
{
-dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index =
+if (unlikely(dev_res->port.gid_tbl[sgid_idx].backend_gid_index == -1)) {
+dev_res->port.gid_tbl[sgid_idx].backend_gid_index =
 rdma_backend_get_gid_index(backend_dev,
-   
_res->ports[0].gid_tbl[sgid_idx].gid);
+   _res->port.gid_tbl[sgid_idx].gid);
 }
 
 pr_dbg("backend_gid_index=%d\n",
-   dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index);
+   dev_res->port.gid_tbl[sgid_idx].backend_gid_index);
 
-return dev_res->ports[0].gid_tbl[sgid_idx].backend_gid_index;
+return dev_res->port.gid_tbl[sgid_idx].backend_gid_index;
 }
 
 static void destroy_qp_hash_key(gpointer data)
@@ -596,15 +596,13 @@ static void destroy_qp_hash_key(gpointer data)
 
 static void init_ports(RdmaDeviceResources *dev_res)
 {
-int i, j;
+int i;
 
-memset(dev_res->ports, 0, sizeof(dev_res->ports));
+memset(_res->port, 0, sizeof(dev_res->port));
 
-for (i = 0; i < MAX_PORTS; i++) {
-dev_res->ports[i].state = IBV_PORT_DOWN;
-for (j = 0; j < MAX_PORT_GIDS; j++) {
-dev_res->ports[i].gid_tbl[j].backend_gid_index = -1;
-}
+dev_res->port.state = IBV_PORT_DOWN;
+for (i = 0; i < MAX_PORT_GIDS; i++) {
+dev_res->port.gid_tbl[i].backend_gid_index = -1;
 }
 }
 
@@ -613,7 +611,7 @@ static void fini_ports(RdmaDeviceResources *dev_res,
 {
 int i;
 
-dev_res->ports[0].state = IBV_PORT_DOWN;
+dev_res->port.state = IBV_PORT_DOWN;
 for (i = 0; i < MAX_PORT_GIDS; i++) {
 rdma_rm_del_gid(dev_res, backend_dev, ifname, i);
 }
diff --git a/hw/rdma/rdma_rm.h b/hw/rdma/rdma_rm.h
index a7169b4e89..3c602c04c0 100644
--- a/hw/rdma/rdma_rm.h
+++ b/hw/rdma/rdma_rm.h
@@ -79,7 +79,7 @@ int rdma_rm_get_backend_gid_index(RdmaDeviceResources 
*dev_res,
 static inline union ibv_gid *rdma_rm_get_gid(RdmaDeviceResources *dev_res,
  int sgid_idx)
 {
-return _res->ports[0].gid_tbl[sgid_idx].gid;
+return _res->port.gid_tbl[sgid_idx].gid;
 }
 
 #endif
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
index 7b3435f991..0ba61d1838 100644
--- a/hw/rdma/rdma_rm_defs.h
+++ b/hw/rdma/rdma_rm_defs.h
@@ -18,7 +18,7 @@
 
 #include "rdma_backend_defs.h"
 
-#define MAX_PORTS 1
+#define MAX_PORTS 1 /* Do not change - we support only one port */
 #define MAX_PORT_GIDS 255
 #define MAX_GIDS  MAX_PORT_GIDS
 #define MAX_PORT_PKEYS1
@@ -97,7 +97,7 @@ typedef struct RdmaRmPort {
 } RdmaRmPort;
 
 typedef struct RdmaDeviceResources {
-RdmaRmPort ports[MAX_PORTS];
+RdmaRmPort port;
 RdmaRmResTbl pd_tbl;
 RdmaRmResTbl mr_tbl;
 RdmaRmResTbl uc_tbl;
-- 
2.17.2




[Qemu-devel] [PATCH v3 10/23] json: Define new QMP message for pvrdma

2018-11-12 Thread Yuval Shaia
pvrdma requires that the same GID attached to it will be attached to the
backend device in the host.

A new QMP message is defined so the pvrdma device can broadcast any change
made to its GID table. This event is captured by libvirt which in turn
will update the GID table in the backend device.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 MAINTAINERS   |  1 +
 Makefile  |  3 ++-
 Makefile.objs |  4 
 qapi/qapi-schema.json |  1 +
 qapi/rdma.json| 38 ++
 5 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 qapi/rdma.json

diff --git a/MAINTAINERS b/MAINTAINERS
index e087d58ac6..a149f68a8f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2232,6 +2232,7 @@ F: hw/rdma/*
 F: hw/rdma/vmw/*
 F: docs/pvrdma.txt
 F: contrib/rdmacm-mux/*
+F: qapi/rdma.json
 
 Build and test automation
 -
diff --git a/Makefile b/Makefile
index 94072776ff..db4ce60ee5 100644
--- a/Makefile
+++ b/Makefile
@@ -599,7 +599,8 @@ qapi-modules = $(SRC_PATH)/qapi/qapi-schema.json 
$(SRC_PATH)/qapi/common.json \
$(SRC_PATH)/qapi/tpm.json \
$(SRC_PATH)/qapi/trace.json \
$(SRC_PATH)/qapi/transaction.json \
-   $(SRC_PATH)/qapi/ui.json
+   $(SRC_PATH)/qapi/ui.json \
+   $(SRC_PATH)/qapi/rdma.json
 
 qapi/qapi-builtin-types.c qapi/qapi-builtin-types.h \
 qapi/qapi-types.c qapi/qapi-types.h \
diff --git a/Makefile.objs b/Makefile.objs
index cc7df3ad80..76d8028f2f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -21,6 +21,7 @@ util-obj-y += qapi/qapi-types-tpm.o
 util-obj-y += qapi/qapi-types-trace.o
 util-obj-y += qapi/qapi-types-transaction.o
 util-obj-y += qapi/qapi-types-ui.o
+util-obj-y += qapi/qapi-types-rdma.o
 util-obj-y += qapi/qapi-builtin-visit.o
 util-obj-y += qapi/qapi-visit.o
 util-obj-y += qapi/qapi-visit-block-core.o
@@ -40,6 +41,7 @@ util-obj-y += qapi/qapi-visit-tpm.o
 util-obj-y += qapi/qapi-visit-trace.o
 util-obj-y += qapi/qapi-visit-transaction.o
 util-obj-y += qapi/qapi-visit-ui.o
+util-obj-y += qapi/qapi-visit-rdma.o
 util-obj-y += qapi/qapi-events.o
 util-obj-y += qapi/qapi-events-block-core.o
 util-obj-y += qapi/qapi-events-block.o
@@ -58,6 +60,7 @@ util-obj-y += qapi/qapi-events-tpm.o
 util-obj-y += qapi/qapi-events-trace.o
 util-obj-y += qapi/qapi-events-transaction.o
 util-obj-y += qapi/qapi-events-ui.o
+util-obj-y += qapi/qapi-events-rdma.o
 util-obj-y += qapi/qapi-introspect.o
 
 chardev-obj-y = chardev/
@@ -155,6 +158,7 @@ common-obj-y += qapi/qapi-commands-tpm.o
 common-obj-y += qapi/qapi-commands-trace.o
 common-obj-y += qapi/qapi-commands-transaction.o
 common-obj-y += qapi/qapi-commands-ui.o
+common-obj-y += qapi/qapi-commands-rdma.o
 common-obj-y += qapi/qapi-introspect.o
 common-obj-y += qmp.o hmp.o
 endif
diff --git a/qapi/qapi-schema.json b/qapi/qapi-schema.json
index 65b6dc2f6f..a650d80f83 100644
--- a/qapi/qapi-schema.json
+++ b/qapi/qapi-schema.json
@@ -94,3 +94,4 @@
 { 'include': 'trace.json' }
 { 'include': 'introspect.json' }
 { 'include': 'misc.json' }
+{ 'include': 'rdma.json' }
diff --git a/qapi/rdma.json b/qapi/rdma.json
new file mode 100644
index 00..804c68ab36
--- /dev/null
+++ b/qapi/rdma.json
@@ -0,0 +1,38 @@
+# -*- Mode: Python -*-
+#
+
+##
+# = RDMA device
+##
+
+##
+# @RDMA_GID_STATUS_CHANGED:
+#
+# Emitted when guest driver adds/deletes GID to/from device
+#
+# @netdev: RoCE Network Device name - char *
+#
+# @gid-status: Add or delete indication - bool
+#
+# @subnet-prefix: Subnet Prefix - uint64
+#
+# @interface-id : Interface ID - uint64
+#
+# Since: 3.2
+#
+# Example:
+#
+# <- {"timestamp": {"seconds": 1541579657, "microseconds": 986760},
+# "event": "RDMA_GID_STATUS_CHANGED",
+# "data":
+# {"netdev": "bridge0",
+# "interface-id": 15880512517475447892,
+# "gid-status": true,
+# "subnet-prefix": 33022}}
+#
+##
+{ 'event': 'RDMA_GID_STATUS_CHANGED',
+  'data': { 'netdev': 'str',
+'gid-status': 'bool',
+'subnet-prefix' : 'uint64',
+'interface-id'  : 'uint64' } }
-- 
2.17.2




[Qemu-devel] [PATCH v3 17/23] hw/pvrdma: Fill error code in command's response

2018-11-12 Thread Yuval Shaia
The driver checks the error code, so let's set it.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma_cmd.c | 67 
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index 0d3c818c20..a326c5d470 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -131,7 +131,8 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 if (rdma_backend_query_port(>backend_dev,
 (struct ibv_port_attr *))) {
-return -ENOMEM;
+resp->hdr.err = -ENOMEM;
+goto out;
 }
 
 memset(resp, 0, sizeof(*resp));
@@ -150,7 +151,9 @@ static int query_port(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->attrs.active_width = 1;
 resp->attrs.active_speed = 1;
 
-return 0;
+out:
+pr_dbg("ret=%d\n", resp->hdr.err);
+return resp->hdr.err;
 }
 
 static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -170,7 +173,7 @@ static int query_pkey(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->pkey = PVRDMA_PKEY;
 pr_dbg("pkey=0x%x\n", resp->pkey);
 
-return 0;
+return resp->hdr.err;
 }
 
 static int create_pd(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -200,7 +203,9 @@ static int destroy_pd(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_pd(>rdma_dev_res, cmd->pd_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+return rsp->hdr.err;
 }
 
 static int create_mr(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -251,7 +256,9 @@ static int destroy_mr(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_mr(>rdma_dev_res, cmd->mr_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+return rsp->hdr.err;
 }
 
 static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring,
@@ -353,7 +360,8 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 cq = rdma_rm_get_cq(>rdma_dev_res, cmd->cq_handle);
 if (!cq) {
 pr_dbg("Invalid CQ handle\n");
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 ring = (PvrdmaRing *)cq->opaque;
@@ -364,7 +372,11 @@ static int destroy_cq(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 
 rdma_rm_dealloc_cq(>rdma_dev_res, cmd->cq_handle);
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma,
@@ -553,7 +565,8 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 qp = rdma_rm_get_qp(>rdma_dev_res, cmd->qp_handle);
 if (!qp) {
 pr_dbg("Invalid QP handle\n");
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 rdma_rm_dealloc_qp(>rdma_dev_res, cmd->qp_handle);
@@ -567,7 +580,11 @@ static int destroy_qp(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 rdma_pci_dma_unmap(PCI_DEVICE(dev), ring->ring_state, TARGET_PAGE_SIZE);
 g_free(ring);
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -580,7 +597,8 @@ static int create_bind(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 pr_dbg("index=%d\n", cmd->index);
 
 if (cmd->index >= MAX_PORT_GIDS) {
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 pr_dbg("gid[%d]=0x%llx,0x%llx\n", cmd->index,
@@ -590,10 +608,15 @@ static int create_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 rc = rdma_rm_add_gid(>rdma_dev_res, >backend_dev,
  dev->backend_eth_device_name, gid, cmd->index);
 if (rc < 0) {
-return -EINVAL;
+rsp->hdr.err = rc;
+goto out;
 }
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int destroy_bind(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -606,7 +629,8 @@ static int destroy_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 pr_dbg("index=%d\n", cmd->index);
 
 if (cmd->index >= MAX_PORT_GIDS) {
-return -EINVAL;
+rsp->hdr.err = -EINVAL;
+goto out;
 }
 
 rc = rdma_rm_del_gid(>rdma_dev_res, >backend_dev,
@@ -617,7 +641,11 @@ static int destroy_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 goto out;
 }
 
-return 0;
+rsp->hdr.err = 0;
+
+out:
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req *req,
@@ -634,9 +662,8 @@ static int create_uc(PVRDMADev *dev, union pvrdma_cmd_req 
*req,
 resp->hdr.err = rdma_rm_alloc_uc(>rdma_dev_res, cmd->pfn,
  >ctx_handle);
 
-pr_dbg("ret=%d\n", resp->hdr.err);
-
-return 0;
+pr_dbg("ret=%d\n", rsp->hdr.err);
+return rsp->hdr.err;
 }
 
 static int destroy_uc(PVRDMADev *dev, union 

[Qemu-devel] [PATCH v3 07/23] hw/pvrdma: Make default pkey 0xFFFF

2018-11-12 Thread Yuval Shaia
Commit 6e7dba23af ("hw/pvrdma: Make default pkey 0xFFFF") exports the
default pkey as an external definition but omits the change from 0x7FFF to
0xFFFF.

Fixes: 6e7dba23af ("hw/pvrdma: Make default pkey 0xFFFF")

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/vmw/pvrdma.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h
index e3742d893a..15c3f28b86 100644
--- a/hw/rdma/vmw/pvrdma.h
+++ b/hw/rdma/vmw/pvrdma.h
@@ -52,7 +52,7 @@
 #define PVRDMA_FW_VERSION14
 
 /* Some defaults */
-#define PVRDMA_PKEY  0x7FFF
+#define PVRDMA_PKEY  0x
 
 typedef struct DSRInfo {
 dma_addr_t dma;
-- 
2.17.2




[Qemu-devel] [PATCH v3 16/23] hw/pvrdma: Fill all CQE fields

2018-11-12 Thread Yuval Shaia
Add ability to pass specific WC attributes to CQE such as GRH_BIT flag.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 59 +++--
 hw/rdma/rdma_backend.h  |  4 +--
 hw/rdma/vmw/pvrdma_qp_ops.c | 31 +++
 3 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 5675504165..e453bda8f9 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -59,13 +59,24 @@ struct backend_umad {
 char mad[RDMA_MAX_PRIVATE_DATA];
 };
 
-static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
+static void (*comp_handler)(void *ctx, struct ibv_wc *wc);
 
-static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
+static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
 {
 pr_err("No completion handler is registered\n");
 }
 
+static inline void complete_work(enum ibv_wc_status status, uint32_t 
vendor_err,
+ void *ctx)
+{
+struct ibv_wc wc = {0};
+
+wc.status = status;
+wc.vendor_err = vendor_err;
+
+comp_handler(ctx, );
+}
+
 static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
 {
 int i, ne;
@@ -90,7 +101,7 @@ static void poll_cq(RdmaDeviceResources *rdma_dev_res, 
struct ibv_cq *ibcq)
 }
 pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv");
 
-comp_handler(wc[i].status, wc[i].vendor_err, bctx->up_ctx);
+comp_handler(bctx->up_ctx, [i]);
 
 rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
 g_free(bctx);
@@ -184,8 +195,8 @@ static void start_comp_thread(RdmaBackendDev *backend_dev)
comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
 }
 
-void rdma_backend_register_comp_handler(void (*handler)(int status,
-unsigned int vendor_err, void *ctx))
+void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
+ struct ibv_wc *wc))
 {
 comp_handler = handler;
 }
@@ -369,14 +380,14 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
 if (qp_type == IBV_QPT_SMI) {
 pr_dbg("QP0 unsupported\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
 } else if (qp_type == IBV_QPT_GSI) {
 pr_dbg("QP1\n");
 rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
 if (rc) {
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
 } else {
-comp_handler(IBV_WC_SUCCESS, 0, ctx);
+complete_work(IBV_WC_SUCCESS, 0, ctx);
 }
 }
 return;
@@ -385,7 +396,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 pr_dbg("num_sge=%d\n", num_sge);
 if (!num_sge) {
 pr_dbg("num_sge=0\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
 return;
 }
 
@@ -396,21 +407,21 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, _id, bctx);
 if (unlikely(rc)) {
 pr_dbg("Failed to allocate cqe_ctx\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
 goto out_free_bctx;
 }
 
 rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, 
num_sge);
 if (rc) {
 pr_dbg("Error: Failed to build host SGE array\n");
-comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
+complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
 goto out_dealloc_cqe_ctx;
 }
 
 if (qp_type == IBV_QPT_UD) {
 wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
 if (!wr.wr.ud.ah) {
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
 goto out_dealloc_cqe_ctx;
 }
 wr.wr.ud.remote_qpn = dqpn;
@@ -428,7 +439,7 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (rc) {
 pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno,
 qp->ibqp->qp_num);
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
 goto out_dealloc_cqe_ctx;
 }
 
@@ -497,13 +508,13 @@ void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
 if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
   

[Qemu-devel] [PATCH v3 11/23] hw/pvrdma: Add support to allow guest to configure GID table

2018-11-12 Thread Yuval Shaia
The control over the RDMA device's GID table is done by updating the
device's Ethernet function addresses.
Usually the first GID entry is determined by the MAC address, the second
by the first IPv6 address and the third by the IPv4 address. Other
entries can be added by adding more IP addresses. The opposite also
holds, i.e. whenever an address is removed, the corresponding GID entry
is removed.

The process is done by the network and RDMA stacks. Whenever an address
is added the ib_core driver is notified and calls the device driver
add_gid function which in turn update the device.

To support this in pvrdma device we need to hook into the create_bind
and destroy_bind HW commands triggered by pvrdma driver in guest.
Whenever a change is made to the pvrdma device's GID table a special
QMP message is sent to be processed by libvirt to update the address of
the backend Ethernet device.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 243 +++-
 hw/rdma/rdma_backend.h  |  22 ++--
 hw/rdma/rdma_backend_defs.h |   3 +-
 hw/rdma/rdma_rm.c   | 104 ++-
 hw/rdma/rdma_rm.h   |  17 ++-
 hw/rdma/rdma_rm_defs.h  |   9 +-
 hw/rdma/rdma_utils.h|  15 +++
 hw/rdma/vmw/pvrdma.h|   2 +-
 hw/rdma/vmw/pvrdma_cmd.c|  55 
 hw/rdma/vmw/pvrdma_main.c   |  25 +---
 hw/rdma/vmw/pvrdma_qp_ops.c |  20 +++
 11 files changed, 370 insertions(+), 145 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 3eb0099f8d..5675504165 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -18,12 +18,14 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qlist.h"
 #include "qapi/qmp/qnum.h"
+#include "qapi/qapi-events-rdma.h"
 
 #include 
 #include 
 #include 
 #include 
 
+#include "contrib/rdmacm-mux/rdmacm-mux.h"
 #include "trace.h"
 #include "rdma_utils.h"
 #include "rdma_rm.h"
@@ -300,11 +302,11 @@ static int build_host_sge_array(RdmaDeviceResources 
*rdma_dev_res,
 return 0;
 }
 
-static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
-uint32_t num_sge)
+static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
+union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
 {
-struct backend_umad umad = {0};
-char *hdr, *msg;
+RdmaCmMuxMsg msg = {0};
+char *hdr, *data;
 int ret;
 
 pr_dbg("num_sge=%d\n", num_sge);
@@ -313,41 +315,50 @@ static int mad_send(RdmaBackendDev *backend_dev, struct 
ibv_sge *sge,
 return -EINVAL;
 }
 
-umad.hdr.length = sge[0].length + sge[1].length;
-pr_dbg("msg_len=%d\n", umad.hdr.length);
+msg.hdr.msg_type = RDMACM_MUX_MSG_TYPE_MAD;
+memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));
 
-if (umad.hdr.length > sizeof(umad.mad)) {
+msg.umad_len = sge[0].length + sge[1].length;
+pr_dbg("umad_len=%d\n", msg.umad_len);
+
+if (msg.umad_len > sizeof(msg.umad.mad)) {
 return -ENOMEM;
 }
 
-umad.hdr.addr.qpn = htobe32(1);
-umad.hdr.addr.grh_present = 1;
-umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
-memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
-umad.hdr.addr.hop_limit = 1;
+msg.umad.hdr.addr.qpn = htobe32(1);
+msg.umad.hdr.addr.grh_present = 1;
+pr_dbg("sgid_idx=%d\n", sgid_idx);
+pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
+msg.umad.hdr.addr.gid_index = sgid_idx;
+memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
+msg.umad.hdr.addr.hop_limit = 1;
 
 hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
-msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+
+pr_dbg_buf("mad_hdr", hdr, sge[0].length);
+pr_dbg_buf("mad_data", data, sge[1].length);
 
-memcpy([0], hdr, sge[0].length);
-memcpy([sge[0].length], msg, sge[1].length);
+memcpy([0], hdr, sge[0].length);
+memcpy([sge[0].length], data, sge[1].length);
 
-rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
+rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
 rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
 
-ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *),
-sizeof(umad));
+ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *),
+sizeof(msg));
 
 pr_dbg("qemu_chr_fe_write=%d\n", ret);
 
-return (ret != sizeof(umad));
+return (ret != sizeof(msg));
 }
 
 void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 RdmaBackendQP *qp, uint8_t qp_type,
 struct ibv_sge *sge, uint32_t num_sge,
-union ibv_gid *dgid, uint32_t dqpn,
-uint32_t dqkey, void *ctx)
+   

[Qemu-devel] [PATCH v3 08/23] hw/pvrdma: Set the correct opcode for recv completion

2018-11-12 Thread Yuval Shaia
The function pvrdma_post_cqe populates the CQE entry with the opcode from
the given completion element. For receive operations the value was not
set. Fix this by setting it to IBV_WC_RECV.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/vmw/pvrdma_qp_ops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index 762700a205..7b0f440fda 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -196,8 +196,9 @@ int pvrdma_qp_recv(PVRDMADev *dev, uint32_t qp_handle)
 comp_ctx = g_malloc(sizeof(CompHandlerCtx));
 comp_ctx->dev = dev;
 comp_ctx->cq_handle = qp->recv_cq_handle;
-comp_ctx->cqe.qp = qp_handle;
 comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
+comp_ctx->cqe.qp = qp_handle;
+comp_ctx->cqe.opcode = IBV_WC_RECV;
 
 rdma_backend_post_recv(>backend_dev, >rdma_dev_res,
>backend_qp, qp->qp_type,
-- 
2.17.2




[Qemu-devel] [PATCH v3 05/23] hw/rdma: Add support for MAD packets

2018-11-12 Thread Yuval Shaia
MAD (Management Datagram) packets are widely used by various modules
both in kernel and in user space for example the rdma_* API which is
used to create and maintain "connection" layer on top of RDMA uses
several types of MAD packets.
To support MAD packets the device uses an external utility
(contrib/rdmacm-mux) to relay packets from and to the guest driver.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.c  | 263 +++-
 hw/rdma/rdma_backend.h  |   4 +-
 hw/rdma/rdma_backend_defs.h |  10 +-
 hw/rdma/vmw/pvrdma.h|   2 +
 hw/rdma/vmw/pvrdma_main.c   |   4 +-
 5 files changed, 273 insertions(+), 10 deletions(-)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index 1e148398a2..3eb0099f8d 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -16,8 +16,13 @@
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qnum.h"
 
 #include 
+#include 
+#include 
+#include 
 
 #include "trace.h"
 #include "rdma_utils.h"
@@ -33,16 +38,25 @@
 #define VENDOR_ERR_MAD_SEND 0x206
 #define VENDOR_ERR_INVLKEY  0x207
 #define VENDOR_ERR_MR_SMALL 0x208
+#define VENDOR_ERR_INV_MAD_BUFF 0x209
+#define VENDOR_ERR_INV_NUM_SGE  0x210
 
 #define THR_NAME_LEN 16
 #define THR_POLL_TO  5000
 
+#define MAD_HDR_SIZE sizeof(struct ibv_grh)
+
 typedef struct BackendCtx {
-uint64_t req_id;
 void *up_ctx;
 bool is_tx_req;
+struct ibv_sge sge; /* Used to save MAD recv buffer */
 } BackendCtx;
 
+struct backend_umad {
+struct ib_user_mad hdr;
+char mad[RDMA_MAX_PRIVATE_DATA];
+};
+
 static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);
 
 static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
@@ -286,6 +300,49 @@ static int build_host_sge_array(RdmaDeviceResources 
*rdma_dev_res,
 return 0;
 }
 
+static int mad_send(RdmaBackendDev *backend_dev, struct ibv_sge *sge,
+uint32_t num_sge)
+{
+struct backend_umad umad = {0};
+char *hdr, *msg;
+int ret;
+
+pr_dbg("num_sge=%d\n", num_sge);
+
+if (num_sge != 2) {
+return -EINVAL;
+}
+
+umad.hdr.length = sge[0].length + sge[1].length;
+pr_dbg("msg_len=%d\n", umad.hdr.length);
+
+if (umad.hdr.length > sizeof(umad.mad)) {
+return -ENOMEM;
+}
+
+umad.hdr.addr.qpn = htobe32(1);
+umad.hdr.addr.grh_present = 1;
+umad.hdr.addr.gid_index = backend_dev->backend_gid_idx;
+memcpy(umad.hdr.addr.gid, backend_dev->gid.raw, sizeof(umad.hdr.addr.gid));
+umad.hdr.addr.hop_limit = 1;
+
+hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
+msg = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
+
+memcpy([0], hdr, sge[0].length);
+memcpy([sge[0].length], msg, sge[1].length);
+
+rdma_pci_dma_unmap(backend_dev->dev, msg, sge[1].length);
+rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
+
+ret = qemu_chr_fe_write(backend_dev->mad_chr_be, (const uint8_t *),
+sizeof(umad));
+
+pr_dbg("qemu_chr_fe_write=%d\n", ret);
+
+return (ret != sizeof(umad));
+}
+
 void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 RdmaBackendQP *qp, uint8_t qp_type,
 struct ibv_sge *sge, uint32_t num_sge,
@@ -304,9 +361,13 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
 } else if (qp_type == IBV_QPT_GSI) {
 pr_dbg("QP1\n");
-comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+rc = mad_send(backend_dev, sge, num_sge);
+if (rc) {
+comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
+} else {
+comp_handler(IBV_WC_SUCCESS, 0, ctx);
+}
 }
-pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
 return;
 }
 
@@ -370,6 +431,48 @@ out_free_bctx:
 g_free(bctx);
 }
 
+static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
+ struct ibv_sge *sge, uint32_t num_sge,
+ void *ctx)
+{
+BackendCtx *bctx;
+int rc;
+uint32_t bctx_id;
+
+if (num_sge != 1) {
+pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
+return VENDOR_ERR_INV_NUM_SGE;
+}
+
+if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
+pr_dbg("Too small buffer for MAD\n");
+return VENDOR_ERR_INV_MAD_BUFF;
+}
+
+pr_dbg("addr=0x%" PRIx64"\n", sge[0].addr);
+pr_dbg("length=%d\n", sge[0].length);
+pr_dbg("lkey=%d\n", sge[0].lkey);
+
+bctx = g_malloc0(sizeof(*bctx));
+
+rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, _id, bctx);
+if (unlikely(rc)) {
+

[Qemu-devel] [PATCH v3 14/23] hw/rdma: Initialize node_guid from vmxnet3 mac address

2018-11-12 Thread Yuval Shaia
node_guid should be set once the device is loaded.
Make node_guid be the EUI-64 GID format (64 bit) of the PCI function 0
vmxnet3 device's MAC.

A new function was added to do the conversion.
So for example the MAC 56:b6:44:e9:62:dc will be converted to GID
54b6:44ff:fee9:62dc.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_utils.h  |  9 +
 hw/rdma/vmw/pvrdma_cmd.c  | 10 --
 hw/rdma/vmw/pvrdma_main.c |  5 -
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/hw/rdma/rdma_utils.h b/hw/rdma/rdma_utils.h
index 989db249ef..202abb3366 100644
--- a/hw/rdma/rdma_utils.h
+++ b/hw/rdma/rdma_utils.h
@@ -63,4 +63,13 @@ extern unsigned long pr_dbg_cnt;
 void *rdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen);
 void rdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len);
 
+static inline void addrconf_addr_eui48(uint8_t *eui, const char *addr)
+{
+memcpy(eui, addr, 3);
+eui[3] = 0xFF;
+eui[4] = 0xFE;
+memcpy(eui + 5, addr + 3, 3);
+eui[0] ^= 2;
+}
+
 #endif
diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index a334f6205e..2979582fac 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -592,16 +592,6 @@ static int create_bind(PVRDMADev *dev, union 
pvrdma_cmd_req *req,
 return -EINVAL;
 }
 
-/* TODO: Since drivers stores node_guid at load_dsr phase then this
- * assignment is not relevant, i need to figure out a way how to
- * retrieve MAC of our netdev */
-if (!cmd->index) {
-dev->node_guid =
-dev->rdma_dev_res.ports[0].gid_tbl[0].gid.global.interface_id;
-pr_dbg("dev->node_guid=0x%llx\n",
-   (long long unsigned int)be64_to_cpu(dev->node_guid));
-}
-
 return 0;
 }
 
diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index fa6468d221..95e9322b7c 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -264,7 +264,7 @@ static void init_dsr_dev_caps(PVRDMADev *dev)
 dsr->caps.sys_image_guid = 0;
 pr_dbg("sys_image_guid=%" PRIx64 "\n", dsr->caps.sys_image_guid);
 
-dsr->caps.node_guid = cpu_to_be64(dev->node_guid);
+dsr->caps.node_guid = dev->node_guid;
 pr_dbg("node_guid=%" PRIx64 "\n", be64_to_cpu(dsr->caps.node_guid));
 
 dsr->caps.phys_port_cnt = MAX_PORTS;
@@ -579,6 +579,9 @@ static void pvrdma_realize(PCIDevice *pdev, Error **errp)
 /* Break if not vmxnet3 device in slot 0 */
 dev->func0 = VMXNET3(pci_get_function_0(pdev));
 
+addrconf_addr_eui48((unsigned char *)>node_guid,
+(const char *)>func0->conf.macaddr.a);
+
 memdev_root = object_resolve_path("/objects", NULL);
 if (memdev_root) {
 object_child_foreach(memdev_root, pvrdma_check_ram_shared, 
_shared);
-- 
2.17.2




[Qemu-devel] [PATCH v3 03/23] hw/rdma: Return qpn 1 if ibqp is NULL

2018-11-12 Thread Yuval Shaia
The device does not support QP0, only QP1.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/rdma_backend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/rdma_backend.h b/hw/rdma/rdma_backend.h
index 86e8fe8ab6..3ccc9a2494 100644
--- a/hw/rdma/rdma_backend.h
+++ b/hw/rdma/rdma_backend.h
@@ -33,7 +33,7 @@ static inline union ibv_gid *rdma_backend_gid(RdmaBackendDev 
*dev)
 
 static inline uint32_t rdma_backend_qpn(const RdmaBackendQP *qp)
 {
-return qp->ibqp ? qp->ibqp->qp_num : 0;
+return qp->ibqp ? qp->ibqp->qp_num : 1;
 }
 
 static inline uint32_t rdma_backend_mr_lkey(const RdmaBackendMR *mr)
-- 
2.17.2




[Qemu-devel] [PATCH v3 06/23] hw/pvrdma: Make function reset_device return void

2018-11-12 Thread Yuval Shaia
This function cannot fail - fix it to return void

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/vmw/pvrdma_main.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c
index 6c8c0154fa..fc2abd34af 100644
--- a/hw/rdma/vmw/pvrdma_main.c
+++ b/hw/rdma/vmw/pvrdma_main.c
@@ -369,13 +369,11 @@ static int unquiesce_device(PVRDMADev *dev)
 return 0;
 }
 
-static int reset_device(PVRDMADev *dev)
+static void reset_device(PVRDMADev *dev)
 {
 pvrdma_stop(dev);
 
 pr_dbg("Device reset complete\n");
-
-return 0;
 }
 
 static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size)
-- 
2.17.2




[Qemu-devel] [PATCH v3 09/23] hw/pvrdma: Set the correct opcode for send completion

2018-11-12 Thread Yuval Shaia
The opcode for a WC should be set by the device and not taken from the
work element.

Signed-off-by: Yuval Shaia 
---
 hw/rdma/vmw/pvrdma_qp_ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index 7b0f440fda..3388be1926 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -154,7 +154,7 @@ int pvrdma_qp_send(PVRDMADev *dev, uint32_t qp_handle)
 comp_ctx->cq_handle = qp->send_cq_handle;
 comp_ctx->cqe.wr_id = wqe->hdr.wr_id;
 comp_ctx->cqe.qp = qp_handle;
-comp_ctx->cqe.opcode = wqe->hdr.opcode;
+comp_ctx->cqe.opcode = IBV_WC_SEND;
 
 rdma_backend_post_send(>backend_dev, >backend_qp, qp->qp_type,
(struct ibv_sge *)>sge[0], 
wqe->hdr.num_sge,
-- 
2.17.2




[Qemu-devel] [PATCH v3 12/23] vmxnet3: Move some definitions to header file

2018-11-12 Thread Yuval Shaia
pvrdma setup requires vmxnet3 device on PCI function 0 and PVRDMA device
on PCI function 1.
pvrdma device needs to access vmxnet3 device object for several reasons:
1. Make sure PCI function 0 is vmxnet3.
2. To monitor vmxnet3 device state.
3. To configure node_guid according to the vmxnet3 device's MAC address.

To be able to access vmxnet3 device the definition of VMXNET3State is
moved to a new header file.

Signed-off-by: Yuval Shaia 
Reviewed-by: Dmitry Fleytman 
---
 hw/net/vmxnet3.c  | 116 +---
 hw/net/vmxnet3_defs.h | 133 ++
 2 files changed, 134 insertions(+), 115 deletions(-)
 create mode 100644 hw/net/vmxnet3_defs.h

diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 3648630386..54746a4030 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -18,7 +18,6 @@
 #include "qemu/osdep.h"
 #include "hw/hw.h"
 #include "hw/pci/pci.h"
-#include "net/net.h"
 #include "net/tap.h"
 #include "net/checksum.h"
 #include "sysemu/sysemu.h"
@@ -29,6 +28,7 @@
 #include "migration/register.h"
 
 #include "vmxnet3.h"
+#include "vmxnet3_defs.h"
 #include "vmxnet_debug.h"
 #include "vmware_utils.h"
 #include "net_tx_pkt.h"
@@ -131,23 +131,11 @@ typedef struct VMXNET3Class {
 DeviceRealize parent_dc_realize;
 } VMXNET3Class;
 
-#define TYPE_VMXNET3 "vmxnet3"
-#define VMXNET3(obj) OBJECT_CHECK(VMXNET3State, (obj), TYPE_VMXNET3)
-
 #define VMXNET3_DEVICE_CLASS(klass) \
 OBJECT_CLASS_CHECK(VMXNET3Class, (klass), TYPE_VMXNET3)
 #define VMXNET3_DEVICE_GET_CLASS(obj) \
 OBJECT_GET_CLASS(VMXNET3Class, (obj), TYPE_VMXNET3)
 
-/* Cyclic ring abstraction */
-typedef struct {
-hwaddr pa;
-uint32_t size;
-uint32_t cell_size;
-uint32_t next;
-uint8_t gen;
-} Vmxnet3Ring;
-
 static inline void vmxnet3_ring_init(PCIDevice *d,
 Vmxnet3Ring *ring,
  hwaddr pa,
@@ -245,108 +233,6 @@ vmxnet3_dump_rx_descr(struct Vmxnet3_RxDesc *descr)
   descr->rsvd, descr->dtype, descr->ext1, descr->btype);
 }
 
-/* Device state and helper functions */
-#define VMXNET3_RX_RINGS_PER_QUEUE (2)
-
-typedef struct {
-Vmxnet3Ring tx_ring;
-Vmxnet3Ring comp_ring;
-
-uint8_t intr_idx;
-hwaddr tx_stats_pa;
-struct UPT1_TxStats txq_stats;
-} Vmxnet3TxqDescr;
-
-typedef struct {
-Vmxnet3Ring rx_ring[VMXNET3_RX_RINGS_PER_QUEUE];
-Vmxnet3Ring comp_ring;
-uint8_t intr_idx;
-hwaddr rx_stats_pa;
-struct UPT1_RxStats rxq_stats;
-} Vmxnet3RxqDescr;
-
-typedef struct {
-bool is_masked;
-bool is_pending;
-bool is_asserted;
-} Vmxnet3IntState;
-
-typedef struct {
-PCIDevice parent_obj;
-NICState *nic;
-NICConf conf;
-MemoryRegion bar0;
-MemoryRegion bar1;
-MemoryRegion msix_bar;
-
-Vmxnet3RxqDescr rxq_descr[VMXNET3_DEVICE_MAX_RX_QUEUES];
-Vmxnet3TxqDescr txq_descr[VMXNET3_DEVICE_MAX_TX_QUEUES];
-
-/* Whether MSI-X support was installed successfully */
-bool msix_used;
-hwaddr drv_shmem;
-hwaddr temp_shared_guest_driver_memory;
-
-uint8_t txq_num;
-
-/* This boolean tells whether RX packet being indicated has to */
-/* be split into head and body chunks from different RX rings  */
-bool rx_packets_compound;
-
-bool rx_vlan_stripping;
-bool lro_supported;
-
-uint8_t rxq_num;
-
-/* Network MTU */
-uint32_t mtu;
-
-/* Maximum number of fragments for indicated TX packets */
-uint32_t max_tx_frags;
-
-/* Maximum number of fragments for indicated RX packets */
-uint16_t max_rx_frags;
-
-/* Index for events interrupt */
-uint8_t event_int_idx;
-
-/* Whether automatic interrupts masking enabled */
-bool auto_int_masking;
-
-bool peer_has_vhdr;
-
-/* TX packets to QEMU interface */
-struct NetTxPkt *tx_pkt;
-uint32_t offload_mode;
-uint32_t cso_or_gso_size;
-uint16_t tci;
-bool needs_vlan;
-
-struct NetRxPkt *rx_pkt;
-
-bool tx_sop;
-bool skip_current_tx_pkt;
-
-uint32_t device_active;
-uint32_t last_command;
-
-uint32_t link_status_and_speed;
-
-Vmxnet3IntState interrupt_states[VMXNET3_MAX_INTRS];
-
-uint32_t temp_mac;   /* To store the low part first */
-
-MACAddr perm_mac;
-uint32_t vlan_table[VMXNET3_VFT_SIZE];
-uint32_t rx_mode;
-MACAddr *mcast_list;
-uint32_t mcast_list_len;
-uint32_t mcast_list_buff_size; /* needed for live migration. */
-
-/* Compatibility flags for migration */
-uint32_t compat_flags;
-} VMXNET3State;
-
 /* Interrupt management */
 
 /*
diff --git a/hw/net/vmxnet3_defs.h b/hw/net/vmxnet3_defs.h
new file mode 100644
index 00..6c19d29b12
--- /dev/null
+++ b/hw/net/vmxnet3_defs.h
@@ -0,0 +1,133 @@
+/*
+ * 

[Qemu-devel] [PATCH v3 01/23] contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer

2018-11-12 Thread Yuval Shaia
The RDMA MAD kernel module (ibcm) disallows more than one MAD agent for
a given MAD class.
This does not go hand-in-hand with the qemu pvrdma device's requirements,
where each VM is a MAD agent.
Fix it by adding an implementation of an RDMA MAD multiplexer service
which on one hand registers as the sole MAD agent with the kernel module
and on the other hand gives service to more than one VM.

Design Overview:

A server process is registered to UMAD framework (for this to work the
rdma_cm kernel module needs to be unloaded) and creates a unix socket to
listen to incoming request from clients.
A client process (such as QEMU) connects to this unix socket and
registers with its own GID.

TX:
---
When a client needs to send an rdma_cm MAD message it constructs it the
same way as without this multiplexer, i.e. it creates a umad packet, but
this time it writes its content to the socket instead of calling
umad_send(). The server, upon receiving such a message, fetches the
local_comm_id from it so a context for this session can be maintained,
and relays the message to the UMAD layer by calling umad_send().

RX:
---
The server creates a worker thread to process incoming rdma_cm MAD
messages. When an incoming message arrives (umad_recv()), the server,
depending on the message type (attr_id), looks for the target client by
searching either the gid->fd table or the local_comm_id->fd table. With
the extracted fd the server relays the incoming message to the client.

Signed-off-by: Yuval Shaia 
---
 MAINTAINERS  |   1 +
 Makefile |   3 +
 Makefile.objs|   1 +
 contrib/rdmacm-mux/Makefile.objs |   4 +
 contrib/rdmacm-mux/main.c| 771 +++
 contrib/rdmacm-mux/rdmacm-mux.h  |  56 +++
 6 files changed, 836 insertions(+)
 create mode 100644 contrib/rdmacm-mux/Makefile.objs
 create mode 100644 contrib/rdmacm-mux/main.c
 create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 98a1856afc..e087d58ac6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2231,6 +2231,7 @@ S: Maintained
 F: hw/rdma/*
 F: hw/rdma/vmw/*
 F: docs/pvrdma.txt
+F: contrib/rdmacm-mux/*
 
 Build and test automation
 -
diff --git a/Makefile b/Makefile
index f2947186a4..94072776ff 100644
--- a/Makefile
+++ b/Makefile
@@ -418,6 +418,7 @@ dummy := $(call unnest-vars,, \
 elf2dmp-obj-y \
 ivshmem-client-obj-y \
 ivshmem-server-obj-y \
+rdmacm-mux-obj-y \
 libvhost-user-obj-y \
 vhost-user-scsi-obj-y \
 vhost-user-blk-obj-y \
@@ -725,6 +726,8 @@ vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) 
libvhost-user.a
$(call LINK, $^)
 vhost-user-blk$(EXESUF): $(vhost-user-blk-obj-y) libvhost-user.a
$(call LINK, $^)
+rdmacm-mux$(EXESUF): $(rdmacm-mux-obj-y) $(COMMON_LDADDS)
+   $(call LINK, $^)
 
 module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
$(call quiet-command,$(PYTHON) $< $@ \
diff --git a/Makefile.objs b/Makefile.objs
index 1e1ff387d7..cc7df3ad80 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -194,6 +194,7 @@ vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS)
 vhost-user-scsi.o-libs := $(LIBISCSI_LIBS)
 vhost-user-scsi-obj-y = contrib/vhost-user-scsi/
 vhost-user-blk-obj-y = contrib/vhost-user-blk/
+rdmacm-mux-obj-y = contrib/rdmacm-mux/
 
 ##
 trace-events-subdirs =
diff --git a/contrib/rdmacm-mux/Makefile.objs b/contrib/rdmacm-mux/Makefile.objs
new file mode 100644
index 00..be3eacb6f7
--- /dev/null
+++ b/contrib/rdmacm-mux/Makefile.objs
@@ -0,0 +1,4 @@
+ifdef CONFIG_PVRDMA
+CFLAGS += -libumad -Wno-format-truncation
+rdmacm-mux-obj-y = main.o
+endif
diff --git a/contrib/rdmacm-mux/main.c b/contrib/rdmacm-mux/main.c
new file mode 100644
index 00..47cf0ac7bc
--- /dev/null
+++ b/contrib/rdmacm-mux/main.c
@@ -0,0 +1,771 @@
+/*
+ * QEMU paravirtual RDMA - rdmacm-mux implementation
+ *
+ * Copyright (C) 2018 Oracle
+ * Copyright (C) 2018 Red Hat Inc
+ *
+ * Authors:
+ * Yuval Shaia 
+ * Marcel Apfelbaum 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "sys/poll.h"
+#include "sys/ioctl.h"
+#include "pthread.h"
+#include "syslog.h"
+
+#include "infiniband/verbs.h"
+#include "infiniband/umad.h"
+#include "infiniband/umad_types.h"
+#include "infiniband/umad_sa.h"
+#include "infiniband/umad_cm.h"
+
+#include "rdmacm-mux.h"
+
+#define SCALE_US 1000
+#define COMMID_TTL 2 /* How many SCALE_US a context of MAD session is saved */
+#define SLEEP_SECS 5 /* This is used both in poll() and thread */
+#define SERVER_LISTEN_BACKLOG 10
+#define MAX_CLIENTS 4096
+#define MAD_RMPP_VERSION 0
+#define MAD_METHOD_MASK0 0x8
+
+#define IB_USER_MAD_LONGS_PER_METHOD_MASK (128 / (8 * 

[Qemu-devel] [PATCH v3 04/23] hw/rdma: Abort send-op if fail to create addr handler

2018-11-12 Thread Yuval Shaia
Function create_ah might return NULL, let's exit with an error.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/rdma_backend.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/rdma/rdma_backend.c b/hw/rdma/rdma_backend.c
index d7a4bbd91f..1e148398a2 100644
--- a/hw/rdma/rdma_backend.c
+++ b/hw/rdma/rdma_backend.c
@@ -338,6 +338,10 @@ void rdma_backend_post_send(RdmaBackendDev *backend_dev,
 if (qp_type == IBV_QPT_UD) {
 wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd,
 backend_dev->backend_gid_idx, dgid);
+if (!wr.wr.ud.ah) {
+comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
+goto out_dealloc_cqe_ctx;
+}
 wr.wr.ud.remote_qpn = dqpn;
 wr.wr.ud.remote_qkey = dqkey;
 }
-- 
2.17.2




[Qemu-devel] [PATCH v3 00/23] Add support for RDMA MAD

2018-11-12 Thread Yuval Shaia
Hi all.

This is a major enhancement to the pvrdma device to allow it to work with
state of the art applications such as MPI.

As described in patch #5, MAD packets are management packets that are used
for many purposes, including, but not limited to, implementing a
communication layer above the IB verbs API.

Patch 1 exposes a new external executable (under contrib) that aims to
address a specific limitation in the RDMA userspace MAD stack.

This patch-set mainly presents the MAD enhancement, but during the work
on it I came across some bugs and enhancements that needed to be
implemented before doing any MAD coding. This is the role of patches 2 to 4, 7 to 9 and 15 to 17.

Patches 6 and 18 are cosmetic changes which, while not strictly relevant
to this patchset, are still introduced with it since (at least for patch
6) they are hard to decouple.

Patches 12 to 15 couple pvrdma device with vmxnet3 device as this is the
configuration enforced by pvrdma driver in guest - a vmxnet3 device in
function 0 and pvrdma device in function 1 in the same PCI slot. Patch 12
moves needed code from vmxnet3 device to a new header file that can be used
by pvrdma code while Patches 13 to 15 use of it.

Along with this patch-set there is a parallel patch posted to libvirt to
apply the change needed there as part of the process implemented in patches
10 and 11. This change is needed so that guest would be able to configure
any IP to the Ethernet function of the pvrdma device.
https://www.redhat.com/archives/libvir-list/2018-November/msg00135.html

Since we maintain external resources such as GIDs on host GID table we need
to do some cleanup before going down. This is the job of patches 19 and 20.
Patches 20 and 21 contain fixes for bugs detected during the work on the
cleanup code that runs during shutdown.

v1 -> v2:
* Fix compilation issue detected when compiling for mingw
* Address comment from Eric Blake re version of QEMU in json
  message
* Fix example from QMP message in json file
* Fix case where a VM tries to remove an invalid GID from GID table
* rdmacm-mux: Cleanup entries in socket-gids table when socket is
  closed
* Cleanup resources (GIDs, QPs etc) when VM goes down

v2 -> v3:
* Address comment from Cornelia Huck for patch #19
* Add some R-Bs from Marcel Apfelbaum and Dmitry Fleytman
* Update docs/pvrdma.txt with the changes made by this patchset
* Address comments from Shamir Rabinovitch for UMAD multiplexer

Thanks,
Yuval

Yuval Shaia (23):
  contrib/rdmacm-mux: Add implementation of RDMA User MAD multiplexer
  hw/rdma: Add ability to force notification without re-arm
  hw/rdma: Return qpn 1 if ibqp is NULL
  hw/rdma: Abort send-op if fail to create addr handler
  hw/rdma: Add support for MAD packets
  hw/pvrdma: Make function reset_device return void
  hw/pvrdma: Make default pkey 0x
  hw/pvrdma: Set the correct opcode for recv completion
  hw/pvrdma: Set the correct opcode for send completion
  json: Define new QMP message for pvrdma
  hw/pvrdma: Add support to allow guest to configure GID table
  vmxnet3: Move some definitions to header file
  hw/pvrdma: Make sure PCI function 0 is vmxnet3
  hw/rdma: Initialize node_guid from vmxnet3 mac address
  hw/pvrdma: Make device state depend on Ethernet function state
  hw/pvrdma: Fill all CQE fields
  hw/pvrdma: Fill error code in command's response
  hw/rdma: Remove unneeded code that handles more that one port
  vl: Introduce shutdown_notifiers
  hw/pvrdma: Clean device's resource when system is shutdown
  hw/rdma: Do not use bitmap_zero_extend to free bitmap
  hw/rdma: Do not call rdma_backend_del_gid on an empty gid
  docs: Update pvrdma device documentation

 MAINTAINERS  |   2 +
 Makefile |   6 +-
 Makefile.objs|   5 +
 contrib/rdmacm-mux/Makefile.objs |   4 +
 contrib/rdmacm-mux/main.c| 771 +++
 contrib/rdmacm-mux/rdmacm-mux.h  |  56 +++
 docs/pvrdma.txt  | 103 -
 hw/net/vmxnet3.c | 116 +
 hw/net/vmxnet3_defs.h| 133 ++
 hw/rdma/rdma_backend.c   | 461 +++---
 hw/rdma/rdma_backend.h   |  28 +-
 hw/rdma/rdma_backend_defs.h  |  13 +-
 hw/rdma/rdma_rm.c| 120 -
 hw/rdma/rdma_rm.h|  17 +-
 hw/rdma/rdma_rm_defs.h   |  21 +-
 hw/rdma/rdma_utils.h |  24 +
 hw/rdma/vmw/pvrdma.h |  10 +-
 hw/rdma/vmw/pvrdma_cmd.c | 119 +++--
 hw/rdma/vmw/pvrdma_main.c|  49 +-
 hw/rdma/vmw/pvrdma_qp_ops.c  |  62 ++-
 include/sysemu/sysemu.h  |   1 +
 qapi/qapi-schema.json|   1 +
 qapi/rdma.json   |  38 ++
 vl.c |  15 +-
 24 files changed, 1868 insertions(+), 307 deletions(-)
 create mode 100644 contrib/rdmacm-mux/Makefile.objs
 create mode 100644 contrib/rdmacm-mux/main.c
 create mode 100644 contrib/rdmacm-mux/rdmacm-mux.h
 create mode 100644 hw/net/vmxnet3_defs.h
 create 

[Qemu-devel] [PATCH v3 02/23] hw/rdma: Add ability to force notification without re-arm

2018-11-12 Thread Yuval Shaia
Upon completion of an incoming packet the device pushes a CQE to the
driver's RX ring and notifies the driver (msix).
While for data-path incoming packets the driver needs the ability to
control whether it wishes to receive interrupts or not, for control-path
packets such as incoming MADs the driver needs to be notified anyway; it
does not even need to re-arm the notification bit.

Enhance the notification field to support this.

Signed-off-by: Yuval Shaia 
Reviewed-by: Marcel Apfelbaum
---
 hw/rdma/rdma_rm.c   | 12 ++--
 hw/rdma/rdma_rm_defs.h  |  8 +++-
 hw/rdma/vmw/pvrdma_qp_ops.c |  6 --
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/hw/rdma/rdma_rm.c b/hw/rdma/rdma_rm.c
index 8d59a42cd1..4f10fcabcc 100644
--- a/hw/rdma/rdma_rm.c
+++ b/hw/rdma/rdma_rm.c
@@ -263,7 +263,7 @@ int rdma_rm_alloc_cq(RdmaDeviceResources *dev_res, 
RdmaBackendDev *backend_dev,
 }
 
 cq->opaque = opaque;
-cq->notify = false;
+cq->notify = CNT_CLEAR;
 
 rc = rdma_backend_create_cq(backend_dev, >backend_cq, cqe);
 if (rc) {
@@ -291,7 +291,10 @@ void rdma_rm_req_notify_cq(RdmaDeviceResources *dev_res, 
uint32_t cq_handle,
 return;
 }
 
-cq->notify = notify;
+if (cq->notify != CNT_SET) {
+cq->notify = notify ? CNT_ARM : CNT_CLEAR;
+}
+
 pr_dbg("notify=%d\n", cq->notify);
 }
 
@@ -349,6 +352,11 @@ int rdma_rm_alloc_qp(RdmaDeviceResources *dev_res, 
uint32_t pd_handle,
 return -EINVAL;
 }
 
+if (qp_type == IBV_QPT_GSI) {
+scq->notify = CNT_SET;
+rcq->notify = CNT_SET;
+}
+
 qp = res_tbl_alloc(_res->qp_tbl, _qpn);
 if (!qp) {
 return -ENOMEM;
diff --git a/hw/rdma/rdma_rm_defs.h b/hw/rdma/rdma_rm_defs.h
index 7228151239..9b399063d3 100644
--- a/hw/rdma/rdma_rm_defs.h
+++ b/hw/rdma/rdma_rm_defs.h
@@ -49,10 +49,16 @@ typedef struct RdmaRmPD {
 uint32_t ctx_handle;
 } RdmaRmPD;
 
+typedef enum CQNotificationType {
+CNT_CLEAR,
+CNT_ARM,
+CNT_SET,
+} CQNotificationType;
+
 typedef struct RdmaRmCQ {
 RdmaBackendCQ backend_cq;
 void *opaque;
-bool notify;
+CQNotificationType notify;
 } RdmaRmCQ;
 
 /* MR (DMA region) */
diff --git a/hw/rdma/vmw/pvrdma_qp_ops.c b/hw/rdma/vmw/pvrdma_qp_ops.c
index c668afd0ed..762700a205 100644
--- a/hw/rdma/vmw/pvrdma_qp_ops.c
+++ b/hw/rdma/vmw/pvrdma_qp_ops.c
@@ -89,8 +89,10 @@ static int pvrdma_post_cqe(PVRDMADev *dev, uint32_t 
cq_handle,
 pvrdma_ring_write_inc(>dsr_info.cq);
 
 pr_dbg("cq->notify=%d\n", cq->notify);
-if (cq->notify) {
-cq->notify = false;
+if (cq->notify != CNT_CLEAR) {
+if (cq->notify == CNT_ARM) {
+cq->notify = CNT_CLEAR;
+}
 post_interrupt(dev, INTR_VEC_CMD_COMPLETION_Q);
 }
 
-- 
2.17.2




[Qemu-devel] [PATCH] slirp: add tftp tracing

2018-11-12 Thread Gerd Hoffmann
Useful when debugging pxeboot, to see what the guest tries to do.

Signed-off-by: Gerd Hoffmann 
---
 Makefile.objs  | 1 +
 slirp/tftp.c   | 3 +++
 slirp/trace-events | 5 +
 3 files changed, 9 insertions(+)
 create mode 100644 slirp/trace-events

diff --git a/Makefile.objs b/Makefile.objs
index 1e1ff387d7..31852eaf8f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -251,6 +251,7 @@ trace-events-subdirs += net
 trace-events-subdirs += qapi
 trace-events-subdirs += qom
 trace-events-subdirs += scsi
+trace-events-subdirs += slirp
 trace-events-subdirs += target/arm
 trace-events-subdirs += target/i386
 trace-events-subdirs += target/mips
diff --git a/slirp/tftp.c b/slirp/tftp.c
index a9bc4bb1b6..735b57aa55 100644
--- a/slirp/tftp.c
+++ b/slirp/tftp.c
@@ -26,6 +26,7 @@
 #include "slirp.h"
 #include "qemu-common.h"
 #include "qemu/cutils.h"
+#include "trace.h"
 
 static inline int tftp_session_in_use(struct tftp_session *spt)
 {
@@ -204,6 +205,7 @@ static void tftp_send_error(struct tftp_session *spt,
   struct mbuf *m;
   struct tftp_t *tp;
 
+  trace_slirp_tftp_error(msg);
   m = m_get(spt->slirp);
 
   if (!m) {
@@ -323,6 +325,7 @@ static void tftp_handle_rrq(Slirp *slirp, struct 
sockaddr_storage *srcsas,
   break;
 }
   }
+  trace_slirp_tftp_rrq(req_fname);
 
   /* check mode */
   if ((pktlen - k) < 6) {
diff --git a/slirp/trace-events b/slirp/trace-events
new file mode 100644
index 00..ff8f656e8c
--- /dev/null
+++ b/slirp/trace-events
@@ -0,0 +1,5 @@
+# See docs/devel/tracing.txt for syntax documentation.
+
+# slirp/tftp.c
+slirp_tftp_rrq(const char *file) "file: %s"
+slirp_tftp_error(const char *file) "msg: %s"
-- 
2.9.3




Re: [Qemu-devel] [PATCH v1 2/3] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.

2018-11-12 Thread Yu Zhang
On Tue, Nov 13, 2018 at 02:12:17PM +0800, Peter Xu wrote:
> On Tue, Nov 13, 2018 at 01:45:44PM +0800, Yu Zhang wrote:
> 
> [...]
> 
> > > > Since at it, another thing I thought about is making sure the IOMMU
> > > > capabilities will match between host and guest IOMMU, which I think
> > > > this series has ignorred so far.  E.g., when we're having assigned
> > > > devices in the guest and with 5-level IOVA, we should make sure the
> > > > host IOMMU supports 5-level as well before the guest starts since
> > > > otherwise the shadow page synchronization could potentially fail when
> > > > the requested IOVA address goes beyond 4-level.  One simple solution
> > > > is just to disable device assignment for now when we're with 57bits
> > > > vIOMMU but I'm not sure whether that's what you want, especially you
> > > > mentioned the DPDK case (who may use assigned devices).
> > > 
> > > Ok I totally forgot that we don't even support any kind of check like
> > > this before... So feel free to skip this comment if you want, or it
> > > would be even nicer if you want to fix it as a whole. :)
> > > 
> > 
> > Indeed. We have talked about this before. How about we focus on the 5-level
> > extension for now, and solve the check issue in the future? I still do not
> > have any clean solutions in mind. BTW, any suggestions on this issue? :)
> 
> I started to remember our discussions, sorry I should remember them
> earlier... :)
> 
> The only thing in my mind (I think I also suggested the same thing
> during that discussion, but I don't trust my memory any more...) is to
> use sysfs.  Say:
> 
>   1. Scan /sys/class/iommu/dmarN for all the host IOMMUs, read cap of
>  each IOMMU from /sys/class/iommu/dmar0/intel-iommu/cap,
> 
>   2. For each host iommu, scan /sys/class/iommu/dmarN/devices for all
>  the devices under each host IOMMU, then we can know which IOMMU
>  owns which device,
> 
>   3. For each assigned device to the guest, we lookup the previous
>  information to know the mgaw for each host device, raise error
>  and stop QEMU from booting if any of the host device has less
>  level supported than the guest vIOMMU (possibly some more checks
>  in vtd_iommu_notify_flag_changed)
> 
> (we still have some issue on vtd_iommu_notify_flag_changed since it's
>  only run until the first enablement of vIOMMU, so we'll only raise
>  the error during guest Linux boots with vIOMMU on. But that's another
>  issue)

Thanks for the explanation, Peter. You do have a better memory than I do. :)


> 
> Regards,
> 
> -- 
> Peter Xu
> 

B.R.
Yu



Re: [Qemu-devel] [PATCH v5 07/14] tests: Add bbc:microbit / nRF51 test suite

2018-11-12 Thread Thomas Huth
On 2018-11-12 22:42, Steffen Görtz wrote:
> The microbit-test includes tests for the nRF51 NVMC
> peripheral and will host future nRF51 peripheral tests
> and board-level bbc:microbit tests.
> 
> Signed-off-by: Steffen Görtz 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  tests/Makefile.include |   2 +
>  tests/microbit-test.c  | 133 +
>  2 files changed, 135 insertions(+)
>  create mode 100644 tests/microbit-test.c
> 
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index f77a495109..602346eeed 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -274,6 +274,7 @@ check-qtest-sparc64-y += tests/boot-serial-test$(EXESUF)
>  check-qtest-arm-y += tests/tmp105-test$(EXESUF)
>  check-qtest-arm-y += tests/pca9552-test$(EXESUF)
>  check-qtest-arm-y += tests/ds1338-test$(EXESUF)
> +check-qtest-arm-y += tests/microbit-test$(EXESUF)
>  check-qtest-arm-y += tests/m25p80-test$(EXESUF)
>  check-qtest-arm-y += tests/virtio-blk-test$(EXESUF)
>  check-qtest-arm-y += tests/test-arm-mptimer$(EXESUF)
> @@ -695,6 +696,7 @@ tests/pxe-test$(EXESUF): tests/pxe-test.o 
> tests/boot-sector.o $(libqos-obj-y)
>  tests/tmp105-test$(EXESUF): tests/tmp105-test.o $(libqos-omap-obj-y)
>  tests/pca9552-test$(EXESUF): tests/pca9552-test.o $(libqos-omap-obj-y)
>  tests/ds1338-test$(EXESUF): tests/ds1338-test.o $(libqos-imx-obj-y)
> +tests/microbit-test$(EXESUF): tests/microbit-test.o
>  tests/m25p80-test$(EXESUF): tests/m25p80-test.o
>  tests/i440fx-test$(EXESUF): tests/i440fx-test.o $(libqos-pc-obj-y)
>  tests/q35-test$(EXESUF): tests/q35-test.o $(libqos-pc-obj-y)
> diff --git a/tests/microbit-test.c b/tests/microbit-test.c
> new file mode 100644
> index 00..40b8b4bc64
> --- /dev/null
> +++ b/tests/microbit-test.c
> @@ -0,0 +1,133 @@
> + /*
> + * QTest testcase for Microbit board using the Nordic Semiconductor nRF51 
> SoC.
> + *
> + * nRF51:
> + * Reference Manual: http://infocenter.nordicsemi.com/pdf/nRF51_RM_v3.0.pdf
> + * Product Spec: http://infocenter.nordicsemi.com/pdf/nRF51822_PS_v3.1.pdf
> + *
> + * Microbit Board: http://microbit.org/
> + *
> + * Copyright 2018 Steffen Görtz 
> + *
> + * This code is licensed under the GPL version 2 or later.  See
> + * the COPYING file in the top-level directory.
> + */
> +
> +
> +#include "qemu/osdep.h"
> +#include "exec/hwaddr.h"
> +#include "libqtest.h"
> +
> +#include "hw/arm/nrf51.h"
> +#include "hw/nvram/nrf51_nvm.h"
> +
> +#define FLASH_SIZE  (256 * NRF51_PAGE_SIZE)
> +
> +static void fill_and_erase(hwaddr base, hwaddr size, uint32_t address_reg)
> +{
> +hwaddr i;
> +
> +/* Erase Page */
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x02);
> +writel(NRF51_NVMC_BASE + address_reg, base);
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x00);
> +
> +/* Check memory */
> +for (i = 0; i < size / 4; i++) {
> +g_assert_cmpuint(readl(base + i * 4), ==, 0x);
> +}
> +
> +/* Fill memory */
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x01);
> +for (i = 0; i < size / 4; i++) {
> +writel(base + i * 4, i);
> +g_assert_cmpuint(readl(base + i * 4), ==, i);
> +}
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x00);
> +}
> +
> +static void test_nrf51_nvmc(void)
> +{
> +uint32_t value;
> +hwaddr i;
> +
> +/* Test always ready */
> +value = readl(NRF51_NVMC_BASE + NRF51_NVMC_READY);
> +g_assert_cmpuint(value & 0x01, ==, 0x01);
> +
> +/* Test write-read config register */
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x03);
> +g_assert_cmpuint(readl(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG), ==, 0x03);
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x00);
> +g_assert_cmpuint(readl(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG), ==, 0x00);
> +
> +/* Test PCR0 */
> +fill_and_erase(NRF51_FLASH_BASE, NRF51_PAGE_SIZE, NRF51_NVMC_ERASEPCR0);
> +fill_and_erase(NRF51_FLASH_BASE + NRF51_PAGE_SIZE,
> +   NRF51_PAGE_SIZE, NRF51_NVMC_ERASEPCR0);
> +
> +/* Test PCR1 */
> +fill_and_erase(NRF51_FLASH_BASE, NRF51_PAGE_SIZE, NRF51_NVMC_ERASEPCR1);
> +fill_and_erase(NRF51_FLASH_BASE + NRF51_PAGE_SIZE,
> +   NRF51_PAGE_SIZE, NRF51_NVMC_ERASEPCR1);
> +
> +/* Erase all */
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x02);
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_ERASEALL, 0x01);
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x00);
> +
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x01);
> +for (i = 0; i < FLASH_SIZE / 4; i++) {
> +writel(NRF51_FLASH_BASE + i * 4, i);
> +g_assert_cmpuint(readl(NRF51_FLASH_BASE + i * 4), ==, i);
> +}
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x00);
> +
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x02);
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_ERASEALL, 0x01);
> +writel(NRF51_NVMC_BASE + NRF51_NVMC_CONFIG, 0x00);
> +
> +for (i = 0; i < FLASH_SIZE / 4; i++) {
> +

Re: [Qemu-devel] [PATCH v5 01/14] qtest: Add set_irq_in command to set IRQ/GPIO level

2018-11-12 Thread Thomas Huth
On 2018-11-12 22:42, Steffen Görtz wrote:
> Adds a new qtest command "set_irq_in" which allows
> to set qemu gpio lines to a given level.
> 
> Based on https://lists.gnu.org/archive/html/qemu-devel/2012-12/msg02363.html
> which never got merged.
> 
> Signed-off-by: Steffen Görtz 
> Originally-by: Matthew Ogilvie 
> Reviewed-by: Stefan Hajnoczi 
> ---
>  qtest.c  | 43 +++
>  tests/libqtest.c | 10 ++
>  tests/libqtest.h | 13 +
>  3 files changed, 66 insertions(+)

Reviewed-by: Thomas Huth 



Re: [Qemu-devel] [PATCH v1 2/3] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.

2018-11-12 Thread Peter Xu
On Tue, Nov 13, 2018 at 01:45:44PM +0800, Yu Zhang wrote:

[...]

> > > Since at it, another thing I thought about is making sure the IOMMU
> > > capabilities will match between host and guest IOMMU, which I think
> > > this series has ignorred so far.  E.g., when we're having assigned
> > > devices in the guest and with 5-level IOVA, we should make sure the
> > > host IOMMU supports 5-level as well before the guest starts since
> > > otherwise the shadow page synchronization could potentially fail when
> > > the requested IOVA address goes beyond 4-level.  One simple solution
> > > is just to disable device assignment for now when we're with 57bits
> > > vIOMMU but I'm not sure whether that's what you want, especially you
> > > mentioned the DPDK case (who may use assigned devices).
> > 
> > Ok I totally forgot that we don't even support any kind of check like
> > this before... So feel free to skip this comment if you want, or it
> > would be even nicer if you want to fix it as a whole. :)
> > 
> 
> Indeed. We have talked about this before. How about we focus on the 5-level
> extension for now, and solve the check issue in the future? I still do not
> have any clean solutions in mind. BTW, any suggestions on this issue? :)

I started to remember our discussions, sorry I should remember them
earlier... :)

The only thing in my mind (I think I also suggested the same thing
during that discussion, but I don't trust my memory any more...) is to
use sysfs.  Say:

  1. Scan /sys/class/iommu/dmarN for all the host IOMMUs, read cap of
 each IOMMU from /sys/class/iommu/dmar0/intel-iommu/cap,

  2. For each host iommu, scan /sys/class/iommu/dmarN/devices for all
 the devices under each host IOMMU, then we can know which IOMMU
 owns which device,

  3. For each assigned device to the guest, we lookup the previous
 information to know the mgaw for each host device, raise error
 and stop QEMU from booting if any of the host device has less
 level supported than the guest vIOMMU (possibly some more checks
 in vtd_iommu_notify_flag_changed)

(we still have some issue on vtd_iommu_notify_flag_changed since it's
 only run until the first enablement of vIOMMU, so we'll only raise
 the error during guest Linux boots with vIOMMU on. But that's another
 issue)

Regards,

-- 
Peter Xu



Re: [Qemu-devel] [PATCH v1 3/3] intel-iommu: search iotlb for levels supported by the address width.

2018-11-12 Thread Yu Zhang
On Tue, Nov 13, 2018 at 01:18:54PM +0800, Peter Xu wrote:
> On Mon, Nov 12, 2018 at 08:38:30PM +0800, Yu Zhang wrote:
> > On Mon, Nov 12, 2018 at 05:36:38PM +0800, Peter Xu wrote:
> > > On Mon, Nov 12, 2018 at 05:25:48PM +0800, Yu Zhang wrote:
> > > > On Mon, Nov 12, 2018 at 04:51:22PM +0800, Peter Xu wrote:
> > > > > On Fri, Nov 09, 2018 at 07:49:47PM +0800, Yu Zhang wrote:
> > > > > > This patch updates vtd_lookup_iotlb() to search cached mappings only
> > > > > > for all page levels supported by address width of current vIOMMU. 
> > > > > > Also,
> > > > > > to cover 57-bit width, the shift of source id(VTD_IOTLB_SID_SHIFT) 
> > > > > > and
> > > > > > of page level(VTD_IOTLB_LVL_SHIFT) are enlarged by 9 - the stride of
> > > > > > one paging structure level.
> > > > > > 
> > > > > > Signed-off-by: Yu Zhang 
> > > > > > ---
> > > > > > Cc: "Michael S. Tsirkin" 
> > > > > > Cc: Marcel Apfelbaum 
> > > > > > Cc: Paolo Bonzini  
> > > > > > Cc: Richard Henderson  
> > > > > > Cc: Eduardo Habkost 
> > > > > > Cc: Peter Xu 
> > > > > > ---
> > > > > >  hw/i386/intel_iommu.c  | 5 +++--
> > > > > >  hw/i386/intel_iommu_internal.h | 7 ++-
> > > > > >  2 files changed, 5 insertions(+), 7 deletions(-)
> > > > > > 
> > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > > > index 9cdf755..ce7e17e 100644
> > > > > > --- a/hw/i386/intel_iommu.c
> > > > > > +++ b/hw/i386/intel_iommu.c
> > > > > > @@ -254,11 +254,12 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr 
> > > > > > addr, uint32_t level)
> > > > > >  static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, 
> > > > > > uint16_t source_id,
> > > > > > hwaddr addr)
> > > > > >  {
> > > > > > -VTDIOTLBEntry *entry;
> > > > > > +VTDIOTLBEntry *entry = NULL;
> > > > > >  uint64_t key;
> > > > > >  int level;
> > > > > > +int max_level = (s->aw_bits - VTD_PAGE_SHIFT_4K) / 
> > > > > > VTD_SL_LEVEL_BITS;
> > > > > >  
> > > > > > -for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; 
> > > > > > level++) {
> > > > > > +for (level = VTD_SL_PT_LEVEL; level < max_level; level++) {
> > > > > 
> > > > > My understanding of current IOTLB is that it only caches the last
> > > > > level of mapping, say:
> > > > > 
> > > > >   - level 1: 4K page
> > > > >   - level 2: 2M page
> > > > >   - level 3: 1G page
> > > > > 
> > > > > So we don't check against level=4 even if x-aw-bits=48 is specified.
> > > > > 
> > > > > Here does it mean that we're going to have... 512G iommu huge pages?
> > > > > 
> > > > 
> > > > No. My bad, I misunderstood this routine. And now I believe we do not
> > > > need this patch. :-)
> > > 
> > > Yeah good to confirm that :-)
> > 
> > Sorry, Peter. I still have question about this part. I agree we do not need
> > to do the extra loop - therefore no need for the max_level part introduced
> > in this patch.
> > 
> > But as to modification of VTD_IOTLB_SID_SHIFT/VTD_IOTLB_LVL_SHIFT, we may
> > still need to do it due to the enlarged gfn, to search an IOTLB entry for
> > a 4K mapping, the pfn itself could be as large as 45-bit.
> 
> Agreed.

Thanks~

> 
> > 
> > Besides, currently vtd_get_iotlb_gfn() is just shifting 12 bits for all
> > different levels, is this necessary? I mean, how about we do the shift
> > based on current level?
> > 
> >  static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
> >  {
> > -return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
> > +uint32_t shift = vtd_slpt_level_shift(level);
> > +return (addr & vtd_slpt_level_page_mask(level)) >> shift;
> >  }
> 
> IMHO we can, but I don't see much gain from it.
> 
> If we shift, we still need to use the maximum possible bits that a PFN
> can hold, which is 45bits (when with 4k pages), so we can't gain
> anything out if it (no saved bits on iotlb key).  Instead, we'll need
> to call more vtd_slpt_level_shift() for each vtd_get_iotlb_gfn() which
> even seems a bit slower.

Yep, we still need to use 45 bits for 4K mappings. The only benifit I can think
of is it's more intuitive - more aligned to the vtd spec of iotlb tags. But just
like you said, I do not see any runtime gain in it. So I'm fine to drop this. :)

> 
> Regards,
> 
> -- 
> Peter Xu
> 

B.R.
Yu



Re: [Qemu-devel] [PATCH v1 2/3] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.

2018-11-12 Thread Yu Zhang
On Tue, Nov 13, 2018 at 01:04:51PM +0800, Peter Xu wrote:
> On Tue, Nov 13, 2018 at 11:37:07AM +0800, Peter Xu wrote:
> > On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> > > On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > > > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > > > A 5-level paging capable VM may choose to use 57-bit IOVA address 
> > > > > width.
> > > > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > > > performing VFIO map/unmap operations, to avoid the burden of managing 
> > > > > the
> > > > > IOVA space.
> > > > 
> > > > Since you mentioned about DPDK... I'm just curious that whether have
> > > > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > > > mode running in the guest? That would be something nice to mention in
> > > > the cover letter if you have.
> > > > 
> > > 
> > > Hah. Maybe I shall not mention DPDK here. 
> > > 
> > > The story is that we heard the requirement, saying applications like DPDK
> > > would need 5-level paging in IOMMU side. And I was convinced after checked
> > > DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> > > patch with DPDK.
> > > 
> > > Instead, I used kvm-unit-test to verify this patch series. And of course, 
> > > I
> > > also did some modification to the test case. Patch for the test also sent 
> > > out
> > > at https://www.spinics.net/lists/kvm/msg177425.html.
> > 
> > Yeah that's perfectly fine for me.  So instead maybe you can also
> > mention the kvm-unit-test in the cover letter if you gonna repost.
> > 
> > > 
> > > > [...]
> > > > 
> > > > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState 
> > > > > *s, Error **errp)
> > > > >  }
> > > > >  }
> > > > >  
> > > > > -/* Currently only address widths supported are 39 and 48 bits */
> > > > > +/* Currently address widths supported are 39, 48, and 57 bits */
> > > > >  if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > > -(s->aw_bits != VTD_AW_48BIT)) {
> > > > > -error_setg(errp, "Supported values for x-aw-bits are: %d, 
> > > > > %d",
> > > > > -   VTD_AW_39BIT, VTD_AW_48BIT);
> > > > > +(s->aw_bits != VTD_AW_48BIT) &&
> > > > > +(s->aw_bits != VTD_AW_57BIT)) {
> > > > > +error_setg(errp, "Supported values for x-aw-bits are: %d, 
> > > > > %d, %d",
> > > > > +   VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > > +return false;
> > > > > +}
> > > > > +
> > > > > +if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > > +!(host_has_la57() && guest_has_la57())) {
> > > > > +error_setg(errp, "Do not support 57-bit DMA address, unless 
> > > > > both "
> > > > > + "host and guest are capable of 5-level 
> > > > > paging.\n");
> > > > 
> > > > Is there any context (or pointer to previous discussions would work
> > > > too) on explaining why we don't support some scenarios like
> > > > host_paw=48,guest_paw=48,guest_gaw=57?
> > > > 
> > > 
> > > Well, above check is only to make sure both the host and the guest can
> > > use 57bit linear address, which requires 5-level paging. So I believe
> > > we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> > > The guest_has_la57() means the guest can use 57-bit linear address,
> > > regardless of its physical address width.
> > 
> > Sorry for my incorrect wording.  I mean when host/guest CPU only
> > support 4-level LA then would/should we allow the guest IOMMU to
> > support 5-level IOVA?  Asked since I'm thinking whether I can run the
> > series a bit with my laptop/servers.
> 
> [...]
> 
> > 
> > Since at it, another thing I thought about is making sure the IOMMU
> > capabilities will match between host and guest IOMMU, which I think
> > this series has ignorred so far.  E.g., when we're having assigned
> > devices in the guest and with 5-level IOVA, we should make sure the
> > host IOMMU supports 5-level as well before the guest starts since
> > otherwise the shadow page synchronization could potentially fail when
> > the requested IOVA address goes beyond 4-level.  One simple solution
> > is just to disable device assignment for now when we're with 57bits
> > vIOMMU but I'm not sure whether that's what you want, especially you
> > mentioned the DPDK case (who may use assigned devices).
> 
> Ok I totally forgot that we don't even support any kind of check like
> this before... So feel free to skip this comment if you want, or it
> would be even nicer if you want to fix it as a whole. :)
> 

Indeed. We have talked about this before. How about we focus on the 5-level
extension for now, and solve the check issue in the future? I still do not
have any clean solutions in mind. BTW, any suggestions on this issue? :)

> Regards,
> 
> -- 
> Peter Xu
> 

B.R.
Yu



Re: [Qemu-devel] [PATCH v1 2/3] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.

2018-11-12 Thread Yu Zhang
On Tue, Nov 13, 2018 at 11:37:07AM +0800, Peter Xu wrote:
> On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> > On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > > performing VFIO map/unmap operations, to avoid the burden of managing 
> > > > the
> > > > IOVA space.
> > > 
> > > Since you mentioned about DPDK... I'm just curious that whether have
> > > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > > mode running in the guest? That would be something nice to mention in
> > > the cover letter if you have.
> > > 
> > 
> > Hah. Maybe I shall not mention DPDK here. 
> > 
> > The story is that we heard the requirement, saying applications like DPDK
> > would need 5-level paging in IOMMU side. And I was convinced after checked
> > DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> > patch with DPDK.
> > 
> > Instead, I used kvm-unit-test to verify this patch series. And of course, I
> > also did some modification to the test case. Patch for the test also sent 
> > out
> > at https://www.spinics.net/lists/kvm/msg177425.html.
> 
> Yeah that's perfectly fine for me.  So instead maybe you can also
> mention the kvm-unit-test in the cover letter if you gonna repost.

Got it. Thanks!

> 
> > 
> > > [...]
> > > 
> > > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState 
> > > > *s, Error **errp)
> > > >  }
> > > >  }
> > > >  
> > > > -/* Currently only address widths supported are 39 and 48 bits */
> > > > +/* Currently address widths supported are 39, 48, and 57 bits */
> > > >  if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > -(s->aw_bits != VTD_AW_48BIT)) {
> > > > -error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > > > -   VTD_AW_39BIT, VTD_AW_48BIT);
> > > > +(s->aw_bits != VTD_AW_48BIT) &&
> > > > +(s->aw_bits != VTD_AW_57BIT)) {
> > > > +error_setg(errp, "Supported values for x-aw-bits are: %d, %d, 
> > > > %d",
> > > > +   VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > +return false;
> > > > +}
> > > > +
> > > > +if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > +!(host_has_la57() && guest_has_la57())) {
> > > > +error_setg(errp, "Do not support 57-bit DMA address, unless 
> > > > both "
> > > > + "host and guest are capable of 5-level 
> > > > paging.\n");
> > > 
> > > Is there any context (or pointer to previous discussions would work
> > > too) on explaining why we don't support some scenarios like
> > > host_paw=48,guest_paw=48,guest_gaw=57?
> > > 
> > 
> > Well, above check is only to make sure both the host and the guest can
> > use 57bit linear address, which requires 5-level paging. So I believe
> > we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> > The guest_has_la57() means the guest can use 57-bit linear address,
> > regardless of its physical address width.
> 
> Sorry for my incorrect wording.  I mean when host/guest CPU only
> support 4-level LA then would/should we allow the guest IOMMU to
> support 5-level IOVA?  Asked since I'm thinking whether I can run the
> series a bit with my laptop/servers.

Well, by "only support", I guess you mean the hardware capability, instead
of its paging mode. So I do not think hardware will support 5-level IOVA for
platforms without 5-level VA. Therefore a 5-level vIOMMU is disallowed here. :)

> 
> Since at it, another thing I thought about is making sure the IOMMU
> capabilities will match between host and guest IOMMU, which I think
> this series has ignorred so far.  E.g., when we're having assigned
> devices in the guest and with 5-level IOVA, we should make sure the
> host IOMMU supports 5-level as well before the guest starts since
> otherwise the shadow page synchronization could potentially fail when
> the requested IOVA address goes beyond 4-level.  One simple solution
> is just to disable device assignment for now when we're with 57bits
> vIOMMU but I'm not sure whether that's what you want, especially you
> mentioned the DPDK case (who may use assigned devices).
> 

Thanks, Peter. Replied in the following up mail. :)

> (sorry to have mentioned the dpdk case again :)
> 
> Regards,
> 
> -- 
> Peter Xu
> 

B.R.
Yu



Re: [Qemu-devel] [Qemu-ppc] [PATCH qemu] ppc/spapr: Receive and store device tree blob from SLOF

2018-11-12 Thread Alexey Kardashevskiy



On 12/11/2018 20:05, Greg Kurz wrote:
> On Mon, 12 Nov 2018 15:12:26 +1100
> Alexey Kardashevskiy  wrote:
> 
>> On 12/11/2018 05:10, Greg Kurz wrote:
>>> Hi Alexey,
>>>
>>> Just a few remarks. See below.
>>>
>>> On Thu,  8 Nov 2018 12:44:06 +1100
>>> Alexey Kardashevskiy  wrote:
>>>   
 SLOF receives a device tree and updates it with various properties
 before switching to the guest kernel and QEMU is not aware of any changes
 made by SLOF. Since there is no real RTAS (QEMU implements it), it makes
 sense to pass the SLOF final device tree to QEMU to let it implement
 RTAS related tasks better, such as PCI host bus adapter hotplug.

 Specifially, now QEMU can find out the actual XICS phandle (for PHB
 hotplug) and the RTAS linux,rtas-entry/base properties (for firmware
 assisted NMI - FWNMI).

 This stores the initial DT blob in the sPAPR machine and replaces it
 in the KVMPPC_H_UPDATE_DT (new private hypercall) handler.

 This adds an @update_dt_enabled machine property to allow backward
 migration.

 SLOF already has a hypercall since
 https://github.com/aik/SLOF/commit/e6fc84652c9c0073f9183

 Signed-off-by: Alexey Kardashevskiy 
 ---
  include/hw/ppc/spapr.h |  7 ++-
  hw/ppc/spapr.c | 29 -
  hw/ppc/spapr_hcall.c   | 32 
  hw/ppc/trace-events|  2 ++
  4 files changed, 68 insertions(+), 2 deletions(-)

 diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
 index ad4d7cfd97..f5dcaf44cb 100644
 --- a/include/hw/ppc/spapr.h
 +++ b/include/hw/ppc/spapr.h
 @@ -100,6 +100,7 @@ struct sPAPRMachineClass {
  
  /*< public >*/
  bool dr_lmb_enabled;   /* enable dynamic-reconfig/hotplug of LMBs 
 */
 +bool update_dt_enabled;/* enable KVMPPC_H_UPDATE_DT */
  bool use_ohci_by_default;  /* use USB-OHCI instead of XHCI */
  bool pre_2_10_has_unused_icps;
  bool legacy_irq_allocation;
 @@ -136,6 +137,9 @@ struct sPAPRMachineState {
  int vrma_adjust;
  ssize_t rtas_size;
  void *rtas_blob;
 +uint32_t fdt_size;
 +uint32_t fdt_initial_size;  
>>>
>>> I don't quite see the purpose of fdt_initial_size... it seems to be only
>>> used to print a trace.  
>>
>>
>> Ah, lost in rebase. The purpose was to test if the new device tree has
>> not grown too much.
>>
> 
> Ok, makes sense during development.
> 
>>
>>
>>>   
 +void *fdt_blob;
  long kernel_size;
  bool kernel_le;
  uint32_t initrd_base;
 @@ -462,7 +466,8 @@ struct sPAPRMachineState {
  #define KVMPPC_H_LOGICAL_MEMOP  (KVMPPC_HCALL_BASE + 0x1)
  /* Client Architecture support */
  #define KVMPPC_H_CAS(KVMPPC_HCALL_BASE + 0x2)
 -#define KVMPPC_HCALL_MAXKVMPPC_H_CAS
 +#define KVMPPC_H_UPDATE_DT  (KVMPPC_HCALL_BASE + 0x3)
 +#define KVMPPC_HCALL_MAXKVMPPC_H_UPDATE_DT
  
  typedef struct sPAPRDeviceTreeUpdateHeader {
  uint32_t version_id;
 diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
 index c08130facb..5e2d4d211c 100644
 --- a/hw/ppc/spapr.c
 +++ b/hw/ppc/spapr.c
 @@ -1633,7 +1633,10 @@ static void spapr_machine_reset(void)
  /* Load the fdt */
  qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt));
  cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt));
 -g_free(fdt);
 +g_free(spapr->fdt_blob);
 +spapr->fdt_size = fdt_totalsize(fdt);
 +spapr->fdt_initial_size = spapr->fdt_size;
 +spapr->fdt_blob = fdt;  
>>>
>>> Hmm... It looks weird to store state in a reset handler. I'd rather zeroe
>>> both fdt_blob and fdt_size here.  
>>
>>
>> The device tree is built from the reset handler and the idea is that we
>> want to always have some tree in the machine.
>>
> 
> Yes of course, I forgot that we need to keep the fdt to be kept
> somewhere so that we can use it :). My remark has more to do
> with migration actually: the fdt built at reset time is supposed
> to derive from the command line and hot-(un)plugged devices, ie,
> identical in source and destination. This isn't state we should
> migrate IIUC.

Having some device tree all the time seems more convenient than managing
the state when we do have one and when we do not.

It is not a big deal though, I'd wait and see what David thinks. Thanks,



> Maybe add a boolean field that tells that the fdt was updated, use
> it in spapr_dtb_needed() and reset it in spapr_machine_reset() ?
> 
>>
>>
>>>   
  
  /* Set up the entry state */
  spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, fdt_addr);
 @@ -1887,6 +1890,27 @@ static const VMStateDescription 
 vmstate_spapr_irq_map = {
  },
  };
  
 +static bool spapr_dtb_needed(void *opaque)
 +{
 +sPAPRMachineClass 

Re: [Qemu-devel] [PATCH v1 3/3] intel-iommu: search iotlb for levels supported by the address width.

2018-11-12 Thread Peter Xu
On Mon, Nov 12, 2018 at 08:38:30PM +0800, Yu Zhang wrote:
> On Mon, Nov 12, 2018 at 05:36:38PM +0800, Peter Xu wrote:
> > On Mon, Nov 12, 2018 at 05:25:48PM +0800, Yu Zhang wrote:
> > > On Mon, Nov 12, 2018 at 04:51:22PM +0800, Peter Xu wrote:
> > > > On Fri, Nov 09, 2018 at 07:49:47PM +0800, Yu Zhang wrote:
> > > > > This patch updates vtd_lookup_iotlb() to search cached mappings only
> > > > > for all page levels supported by address width of current vIOMMU. 
> > > > > Also,
> > > > > to cover 57-bit width, the shift of source id(VTD_IOTLB_SID_SHIFT) and
> > > > > of page level(VTD_IOTLB_LVL_SHIFT) are enlarged by 9 - the stride of
> > > > > one paging structure level.
> > > > > 
> > > > > Signed-off-by: Yu Zhang 
> > > > > ---
> > > > > Cc: "Michael S. Tsirkin" 
> > > > > Cc: Marcel Apfelbaum 
> > > > > Cc: Paolo Bonzini  
> > > > > Cc: Richard Henderson  
> > > > > Cc: Eduardo Habkost 
> > > > > Cc: Peter Xu 
> > > > > ---
> > > > >  hw/i386/intel_iommu.c  | 5 +++--
> > > > >  hw/i386/intel_iommu_internal.h | 7 ++-
> > > > >  2 files changed, 5 insertions(+), 7 deletions(-)
> > > > > 
> > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > > index 9cdf755..ce7e17e 100644
> > > > > --- a/hw/i386/intel_iommu.c
> > > > > +++ b/hw/i386/intel_iommu.c
> > > > > @@ -254,11 +254,12 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, 
> > > > > uint32_t level)
> > > > >  static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t 
> > > > > source_id,
> > > > > hwaddr addr)
> > > > >  {
> > > > > -VTDIOTLBEntry *entry;
> > > > > +VTDIOTLBEntry *entry = NULL;
> > > > >  uint64_t key;
> > > > >  int level;
> > > > > +int max_level = (s->aw_bits - VTD_PAGE_SHIFT_4K) / 
> > > > > VTD_SL_LEVEL_BITS;
> > > > >  
> > > > > -for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; 
> > > > > level++) {
> > > > > +for (level = VTD_SL_PT_LEVEL; level < max_level; level++) {
> > > > 
> > > > My understanding of current IOTLB is that it only caches the last
> > > > level of mapping, say:
> > > > 
> > > >   - level 1: 4K page
> > > >   - level 2: 2M page
> > > >   - level 3: 1G page
> > > > 
> > > > So we don't check against level=4 even if x-aw-bits=48 is specified.
> > > > 
> > > > Here does it mean that we're going to have... 512G iommu huge pages?
> > > > 
> > > 
> > > No. My bad, I misunderstood this routine. And now I believe we do not
> > > need this patch. :-)
> > 
> > Yeah good to confirm that :-)
> 
> Sorry, Peter. I still have question about this part. I agree we do not need
> to do the extra loop - therefore no need for the max_level part introduced
> in this patch.
> 
> But as to modification of VTD_IOTLB_SID_SHIFT/VTD_IOTLB_LVL_SHIFT, we may
> still need to do it due to the enlarged gfn, to search an IOTLB entry for
> a 4K mapping, the pfn itself could be as large as 45-bit.

Agreed.

> 
> Besides, currently vtd_get_iotlb_gfn() is just shifting 12 bits for all
> different levels, is this necessary? I mean, how about we do the shift
> based on current level?
> 
>  static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
>  {
> -return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
> +uint32_t shift = vtd_slpt_level_shift(level);
> +return (addr & vtd_slpt_level_page_mask(level)) >> shift;
>  }

IMHO we can, but I don't see much gain from it.

If we shift, we still need to use the maximum possible bits that a PFN
can hold, which is 45bits (when with 4k pages), so we can't gain
anything out if it (no saved bits on iotlb key).  Instead, we'll need
to call more vtd_slpt_level_shift() for each vtd_get_iotlb_gfn() which
even seems a bit slower.

Regards,

-- 
Peter Xu



Re: [Qemu-devel] [PATCH v1 2/3] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.

2018-11-12 Thread Peter Xu
On Tue, Nov 13, 2018 at 11:37:07AM +0800, Peter Xu wrote:
> On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> > On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > > performing VFIO map/unmap operations, to avoid the burden of managing 
> > > > the
> > > > IOVA space.
> > > 
> > > Since you mentioned about DPDK... I'm just curious that whether have
> > > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > > mode running in the guest? That would be something nice to mention in
> > > the cover letter if you have.
> > > 
> > 
> > Hah. Maybe I shall not mention DPDK here. 
> > 
> > The story is that we heard the requirement, saying applications like DPDK
> > would need 5-level paging in IOMMU side. And I was convinced after checked
> > DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> > patch with DPDK.
> > 
> > Instead, I used kvm-unit-test to verify this patch series. And of course, I
> > also did some modification to the test case. Patch for the test also sent 
> > out
> > at https://www.spinics.net/lists/kvm/msg177425.html.
> 
> Yeah that's perfectly fine for me.  So instead maybe you can also
> mention the kvm-unit-test in the cover letter if you gonna repost.
> 
> > 
> > > [...]
> > > 
> > > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState 
> > > > *s, Error **errp)
> > > >  }
> > > >  }
> > > >  
> > > > -/* Currently only address widths supported are 39 and 48 bits */
> > > > +/* Currently address widths supported are 39, 48, and 57 bits */
> > > >  if ((s->aw_bits != VTD_AW_39BIT) &&
> > > > -(s->aw_bits != VTD_AW_48BIT)) {
> > > > -error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > > > -   VTD_AW_39BIT, VTD_AW_48BIT);
> > > > +(s->aw_bits != VTD_AW_48BIT) &&
> > > > +(s->aw_bits != VTD_AW_57BIT)) {
> > > > +error_setg(errp, "Supported values for x-aw-bits are: %d, %d, 
> > > > %d",
> > > > +   VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > > +return false;
> > > > +}
> > > > +
> > > > +if ((s->aw_bits == VTD_AW_57BIT) &&
> > > > +!(host_has_la57() && guest_has_la57())) {
> > > > +error_setg(errp, "Do not support 57-bit DMA address, unless 
> > > > both "
> > > > + "host and guest are capable of 5-level 
> > > > paging.\n");
> > > 
> > > Is there any context (or pointer to previous discussions would work
> > > too) on explaining why we don't support some scenarios like
> > > host_paw=48,guest_paw=48,guest_gaw=57?
> > > 
> > 
> > Well, above check is only to make sure both the host and the guest can
> > use 57bit linear address, which requires 5-level paging. So I believe
> > we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> > The guest_has_la57() means the guest can use 57-bit linear address,
> > regardless of its physical address width.
> 
> Sorry for my incorrect wording.  I mean when host/guest CPU only
> support 4-level LA then would/should we allow the guest IOMMU to
> support 5-level IOVA?  Asked since I'm thinking whether I can run the
> series a bit with my laptop/servers.

[...]

> 
> Since at it, another thing I thought about is making sure the IOMMU
> capabilities will match between host and guest IOMMU, which I think
> this series has ignorred so far.  E.g., when we're having assigned
> devices in the guest and with 5-level IOVA, we should make sure the
> host IOMMU supports 5-level as well before the guest starts since
> otherwise the shadow page synchronization could potentially fail when
> the requested IOVA address goes beyond 4-level.  One simple solution
> is just to disable device assignment for now when we're with 57bits
> vIOMMU but I'm not sure whether that's what you want, especially you
> mentioned the DPDK case (who may use assigned devices).

Ok I totally forgot that we don't even support any kind of check like
this before... So feel free to skip this comment if you want, or it
would be even nicer if you want to fix it as a whole. :)

Regards,

-- 
Peter Xu



Re: [Qemu-devel] [PATCH] virtio-net: support RSC v4/v6 tcp traffic for Windows HCK

2018-11-12 Thread Wei Xu
Looks good, I can't recall the status of last version well but
I remember Jason gave some comments about sanity check are quiet
essential, have you addressed them?

Reviewed by: Wei Xu 

On Fri, Nov 09, 2018 at 04:58:27PM +0200, Yuri Benditovich wrote:
> This commit adds implementation of RX packets
> coalescing, compatible with requirements of Windows
> Hardware compatibility kit.
> 
> The device enables feature VIRTIO_NET_F_RSC_EXT in
> host features if it supports extended RSC functionality
> as defined in the specification.
> This feature requires at least one of VIRTIO_NET_F_GUEST_TSO4,
> VIRTIO_NET_F_GUEST_TSO6. Windows guest driver acks
> this feature only if VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
> is also present.
> 
> In case vhost is enabled the feature bit is cleared in
> host_features during device initialization.
> 
> If the guest driver acks VIRTIO_NET_F_RSC_EXT feature,
> the device coalesces TCPv4 and TCPv6 packets (if
> respective VIRTIO_NET_F_GUEST_TSO feature is on,
> populates extended RSC information in virtio header
> and sets VIRTIO_NET_HDR_F_RSC_INFO bit in header flags.
> The device does not recalculate checksums in the coalesced
> packet, so they are not valid.
> 
> In this case:
> All the data packets in a tcp connection are cached
> to a single buffer in every receive interval, and will
> be sent out via a timer, the 'virtio_net_rsc_timeout'
> controls the interval, this value may impact the
> performance and response time of tcp connection,
> 5(50us) is an experience value to gain a performance
> improvement, since the whql test sends packets every 100us,
> so '30(300us)' passes the test case, it is the default
> value as well, tune it via the command line parameter
> 'rsc_interval' within 'virtio-net-pci' device, for example,
> to launch a guest with interval set as '50':
> 
> 'virtio-net-pci,netdev=hostnet1,bus=pci.0,id=net1,mac=00,
> guest_rsc_ext=on,rsc_interval=50'
> 
> The timer will only be triggered if the packets pool is not empty,
> and it'll drain off all the cached packets.
> 
> 'NetRscChain' is used to save the segments of IPv4/6 in a
> VirtIONet device.
> 
> A new segment becomes a 'Candidate' as well as it passed sanity check,
> the main handler of TCP includes TCP window update, duplicated
> ACK check and the real data coalescing.
> 
> An 'Candidate' segment means:
> 1. Segment is within current window and the sequence is the expected one.
> 2. 'ACK' of the segment is in the valid window.
> 
> Sanity check includes:
> 1. Incorrect version in IP header
> 2. An IP options or IP fragment
> 3. Not a TCP packet
> 4. Sanity size check to prevent buffer overflow attack.
> 5. An ECN packet
> 
> Even though, there might more cases should be considered such as
> ip identification other flags, while it breaks the test because
> windows set it to the same even it's not a fragment.
> 
> Normally it includes 2 typical ways to handle a TCP control flag,
> 'bypass' and 'finalize', 'bypass' means should be sent out directly,
> while 'finalize' means the packets should also be bypassed, but this
> should be done after search for the same connection packets in the
> pool and drain all of them out, this is to avoid out of order fragment.
> 
> All the 'SYN' packets will be bypassed since this always begin a new'
> connection, other flags such 'URG/FIN/RST/CWR/ECE' will trigger a
> finalization, because this normally happens upon a connection is going
> to be closed, an 'URG' packet also finalize current coalescing unit.
> 
> Statistics can be used to monitor the basic coalescing status, the
> 'out of order' and 'out of window' means how many retransmitting packets,
> thus describe the performance intuitively.
> 
> Difference between ip v4 and v6 processing:
>  Fragment length in ipv4 header includes itself, while it's not
>  included for ipv6, thus means ipv6 can carry a real 65535 payload.
> 
> Signed-off-by: Wei Xu 
> Signed-off-by: Yuri Benditovich 
> ---
>  hw/net/virtio-net.c | 648 +++-
>  include/hw/virtio/virtio-net.h  |  81 +++
>  include/net/eth.h   |   2 +
>  include/standard-headers/linux/virtio_net.h |   8 +
>  4 files changed, 734 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 385b1a03e9..43a7021409 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -41,6 +41,28 @@
>  #define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
>  #define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE
>  
> +#define VIRTIO_NET_IP4_ADDR_SIZE   8/* ipv4 saddr + daddr */
> +
> +#define VIRTIO_NET_TCP_FLAG 0x3F
> +#define VIRTIO_NET_TCP_HDR_LENGTH   0xF000
> +
> +/* IPv4 max payload, 16 bits in the header */
> +#define VIRTIO_NET_MAX_IP4_PAYLOAD (65535 - sizeof(struct ip_header))
> +#define VIRTIO_NET_MAX_TCP_PAYLOAD 65535
> +
> +/* header length value in ip header without option */
> +#define 

Re: [Qemu-devel] [PATCH v1 2/3] intel-iommu: extend VTD emulation to allow 57-bit IOVA address width.

2018-11-12 Thread Peter Xu
On Mon, Nov 12, 2018 at 05:42:01PM +0800, Yu Zhang wrote:
> On Mon, Nov 12, 2018 at 04:36:34PM +0800, Peter Xu wrote:
> > On Fri, Nov 09, 2018 at 07:49:46PM +0800, Yu Zhang wrote:
> > > A 5-level paging capable VM may choose to use 57-bit IOVA address width.
> > > E.g. guest applications like DPDK prefer to use its VA as IOVA when
> > > performing VFIO map/unmap operations, to avoid the burden of managing the
> > > IOVA space.
> > 
> > Since you mentioned about DPDK... I'm just curious that whether have
> > you tested the patchset with the 57bit-enabled machines with DPDK VA
> > mode running in the guest? That would be something nice to mention in
> > the cover letter if you have.
> > 
> 
> Hah. Maybe I shall not mention DPDK here. 
> 
> The story is that we heard the requirement, saying applications like DPDK
> would need 5-level paging in IOMMU side. And I was convinced after checked
> DPDK code, seeing it may use VA as IOVA directly. But I did not test this
> patch with DPDK.
> 
> Instead, I used kvm-unit-test to verify this patch series. And of course, I
> also did some modification to the test case. Patch for the test also sent out
> at https://www.spinics.net/lists/kvm/msg177425.html.

Yeah that's perfectly fine for me.  So instead maybe you can also
mention the kvm-unit-test in the cover letter if you gonna repost.

> 
> > [...]
> > 
> > > @@ -3264,11 +3286,19 @@ static bool vtd_decide_config(IntelIOMMUState *s, 
> > > Error **errp)
> > >  }
> > >  }
> > >  
> > > -/* Currently only address widths supported are 39 and 48 bits */
> > > +/* Currently address widths supported are 39, 48, and 57 bits */
> > >  if ((s->aw_bits != VTD_AW_39BIT) &&
> > > -(s->aw_bits != VTD_AW_48BIT)) {
> > > -error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
> > > -   VTD_AW_39BIT, VTD_AW_48BIT);
> > > +(s->aw_bits != VTD_AW_48BIT) &&
> > > +(s->aw_bits != VTD_AW_57BIT)) {
> > > +error_setg(errp, "Supported values for x-aw-bits are: %d, %d, 
> > > %d",
> > > +   VTD_AW_39BIT, VTD_AW_48BIT, VTD_AW_57BIT);
> > > +return false;
> > > +}
> > > +
> > > +if ((s->aw_bits == VTD_AW_57BIT) &&
> > > +!(host_has_la57() && guest_has_la57())) {
> > > +error_setg(errp, "Do not support 57-bit DMA address, unless both 
> > > "
> > > + "host and guest are capable of 5-level 
> > > paging.\n");
> > 
> > Is there any context (or pointer to previous discussions would work
> > too) on explaining why we don't support some scenarios like
> > host_paw=48,guest_paw=48,guest_gaw=57?
> > 
> 
> Well, above check is only to make sure both the host and the guest can
> use 57bit linear address, which requires 5-level paging. So I believe
> we do support scenarios like host_paw=48,guest_paw=48,guest_gaw=57.
> The guest_has_la57() means the guest can use 57-bit linear address,
> regardless of its physical address width.

Sorry for my incorrect wording.  I mean when host/guest CPU only
support 4-level LA then would/should we allow the guest IOMMU to
support 5-level IOVA?  Asked since I'm thinking whether I can run the
series a bit with my laptop/servers.

Since at it, another thing I thought about is making sure the IOMMU
capabilities will match between host and guest IOMMU, which I think
this series has ignorred so far.  E.g., when we're having assigned
devices in the guest and with 5-level IOVA, we should make sure the
host IOMMU supports 5-level as well before the guest starts since
otherwise the shadow page synchronization could potentially fail when
the requested IOVA address goes beyond 4-level.  One simple solution
is just to disable device assignment for now when we're with 57bits
vIOMMU but I'm not sure whether that's what you want, especially you
mentioned the DPDK case (who may use assigned devices).

(sorry to have mentioned the dpdk case again :)

Regards,

-- 
Peter Xu



Re: [Qemu-devel] [PATCH] 9p: write lock path in v9fs_co_open2()

2018-11-12 Thread zhibin hu
Sorry, i have no time to make poc recently.

IMHO, the implementation of v9fs_path_copy is not secure, it first free the
original value and than copy the new value, there is a race.

So each caller must ensure the synchronization, maybe more locks are needed.

thanks.


On Mon, Nov 12, 2018 at 10:39 PM Greg Kurz  wrote:

> On Mon, 12 Nov 2018 12:19:29 +0100
> Greg Kurz  wrote:
>
> > On Mon, 12 Nov 2018 19:05:59 +0800
> > zhibin hu  wrote:
> >
> > > yes, and this :
> > >
> >
> > Yeah, all call sites of v9fs_path_copy() in v9fs_create() are called in
> the
> > context of the main thread. They may race with any other access to the
> fid
> > path performed by some other command in the context of a worker thread.
> My
> > first guess is that v9fs_create() should take the write lock before
> writing
> > to the fid path.
> >
>
> I think this call to v9fs_path_copy() in v9fs_walk() can also race:
>
> if (fid == newfid) {
> if (fidp->fid_type != P9_FID_NONE) {
> err = -EINVAL;
> goto out;
> }
> v9fs_path_copy(&fidp->path, &path);
> } else {
>
>
> Worse, since v9fs_co_open2() may overwrite the fid path from a worker
> thread, it seems that some more code might require to run with the read
> lock taken...
>
> > BTW, if you could share all the reproducers you already have for these
> > heap-use-after-free issues, it would be appreciated, and probably speed
> > up the fixing.
> >
> > > ==6094==ERROR: AddressSanitizer: heap-use-after-free on address
> > > 0x602e6751 at pc 0x562a8dc492b8 bp 0x7f6805d2fa10 sp 0x7f6805d2fa00
> > > READ of size 1 at 0x602e6751 thread T21
> > > #0 0x562a8dc492b7 in local_open_nofollow hw/9pfs/9p-local.c:59
> > > #1 0x562a8dc49361 in local_opendir_nofollow hw/9pfs/9p-local.c:92
> > > #2 0x562a8dc4bd6e in local_mknod hw/9pfs/9p-local.c:662
> > > #3 0x562a8dc521de in v9fs_co_mknod hw/9pfs/cofs.c:200
> > > #4 0x562a8dc4413e in v9fs_mknod hw/9pfs/9p.c:3044
> > > #5 0x562a8e600976 in coroutine_trampoline
> util/coroutine-ucontext.c:116
> > > #6 0x7f68713635ff in __correctly_grouped_prefixwc
> > > (/lib64/libc.so.6+0x4c5ff)
> > >
> > > 0x602e6751 is located 1 bytes inside of 2-byte region
> > > [0x602e6750,0x602e6752)
> > > freed by thread T0 here:
> > > #0 0x7f687cdb9880 in __interceptor_free
> (/lib64/libasan.so.5+0xee880)
> > > #1 0x7f687c1494d1 in g_free (/lib64/libglib-2.0.so.0+0x524d1)
> > > #2 0x562a8dc30ce4 in v9fs_path_copy hw/9pfs/9p.c:195
> > > #3 0x562a8dc3f0f3 in v9fs_create hw/9pfs/9p.c:2286
> > > #4 0x562a8e600976 in coroutine_trampoline
> util/coroutine-ucontext.c:116
> > > #5 0x7f68713635ff in __correctly_grouped_prefixwc
> > > (/lib64/libc.so.6+0x4c5ff)
> > >
> > > previously allocated by thread T5 here:
> > > #0 0x7f687cdb9c48 in malloc (/lib64/libasan.so.5+0xeec48)
> > > #1 0x7f6871421c37 in __GI___vasprintf_chk
> (/lib64/libc.so.6+0x10ac37)
> > >
> > > Thread T21 created by T0 here:
> > > #0 0x7f687cd16443 in pthread_create (/lib64/libasan.so.5+0x4b443)
> > > #1 0x562a8e5bf61e in qemu_thread_create
> util/qemu-thread-posix.c:534
> > > #2 0x562a8e5adbe4 in do_spawn_thread util/thread-pool.c:135
> > > #3 0x562a8e5adc73 in spawn_thread_bh_fn util/thread-pool.c:143
> > > #4 0x562a8e5aa4d0 in aio_bh_call util/async.c:90
> > > #5 0x562a8e5aa787 in aio_bh_poll util/async.c:118
> > > #6 0x562a8e5b65bd in aio_dispatch util/aio-posix.c:436
> > > #7 0x562a8e5ab26f in aio_ctx_dispatch util/async.c:261
> > > #8 0x7f687c1438ac in g_main_context_dispatch
> > > (/lib64/libglib-2.0.so.0+0x4c8ac)
> > >
> > > Thread T5 created by T0 here:
> > > #0 0x7f687cd16443 in pthread_create (/lib64/libasan.so.5+0x4b443)
> > > #1 0x562a8e5bf61e in qemu_thread_create
> util/qemu-thread-posix.c:534
> > > #2 0x562a8d7a2258 in qemu_kvm_start_vcpu
> /root/qemu-3.0.0/cpus.c:1935
> > > #3 0x562a8d7a2a0b in qemu_init_vcpu /root/qemu-3.0.0/cpus.c:2001
> > > #4 0x562a8da3ef0c in x86_cpu_realizefn
> > > /root/qemu-3.0.0/target/i386/cpu.c:4996
> > > #5 0x562a8dd3a1e8 in device_set_realized hw/core/qdev.c:826
> > > #6 0x562a8e2a1e62 in property_set_bool qom/object.c:1984
> > > #7 0x562a8e29db0e in object_property_set qom/object.c:1176
> > > #8 0x562a8e2a4b00 in object_property_set_qobject
> qom/qom-qobject.c:27
> > > #9 0x562a8e29de2b in object_property_set_bool qom/object.c:1242
> > > #10 0x562a8d9ad452 in pc_new_cpu /root/qemu-3.0.0/hw/i386/pc.c:1107
> > > #11 0x562a8d9ad993 in pc_cpus_init
> /root/qemu-3.0.0/hw/i386/pc.c:1155
> > > #12 0x562a8d9b79cb in pc_init1
> /root/qemu-3.0.0/hw/i386/pc_piix.c:153
> > > #13 0x562a8d9b981d in pc_init_v3_0
> > > /root/qemu-3.0.0/hw/i386/pc_piix.c:438
> > > #14 0x562a8dd4bf29 in machine_run_board_init hw/core/machine.c:830
> > > #15 0x562a8db8d46e in main /root/qemu-3.0.0/vl.c:4516
> > > #16 0x7f687133a11a in __libc_start_main (/lib64/libc.so.6+0x2311a)
> > 

[Qemu-devel] [Bug 1802915] Re: GTK display refresh rate is throttled

2018-11-12 Thread Chen Zhang
** Description changed:

  Guest OS running with GL enabled GTK display shows a reduced refresh
  rate, e.g. moving cursor around with iGVT-g DMA Buf.
  
- It seems that a default refresh interval GUI_REFRESH_INTERVAL_DEFAULT
+ Apparently, a default refresh interval GUI_REFRESH_INTERVAL_DEFAULT
  (30ms) is defined in include/ui/console.h, throttling the display
  refresh rate at 33Hz.
  
- To correct this throttle issue, a shorter interval should be applied to
- display change listener or the default value should be used.
+ To correct this throttle issue, a shorter interval (16 or 17
+ milliseconds) should be applied to display change listener or the
+ default value should be modified.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1802915

Title:
  GTK display refresh rate is throttled

Status in QEMU:
  New

Bug description:
  Guest OS running with GL enabled GTK display shows a reduced refresh
  rate, e.g. moving cursor around with iGVT-g DMA Buf.

  Apparently, a default refresh interval GUI_REFRESH_INTERVAL_DEFAULT
  (30ms) is defined in include/ui/console.h, throttling the display
  refresh rate at 33Hz.

  To correct this throttle issue, a shorter interval (16 or 17
  milliseconds) should be applied to display change listener or the
  default value should be modified.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1802915/+subscriptions



[Qemu-devel] [PULL for-3.1 0/2] qemu-ga patch queue for 3.1.0

2018-11-12 Thread Michael Roth
The following changes since commit 160e5c22e55b3f775c2003dfc626fa872ee4a7a1:

  Merge remote-tracking branch 'remotes/gkurz/tags/for-upstream' into staging 
(2018-11-09 10:54:10 +0000)

are available in the Git repository at:

  git://github.com/mdroth/qemu.git tags/qga-pull-2018-11-12-tag

for you to fetch changes up to 61baac2fdb7ad3891fb00bbd3c9e8b8ca87f0f62:

  qga: Add multiple include guard to guest-agent-core.h (2018-11-09 07:55:13 
-0600)


qemu-ga patch queue for 3.1.0

* add missing #include guards for guest-agent-core.h
* fix leaks introduced with recent win32 enablement of disk info in
  guest-get-fsinfo


Marc-André Lureau (1):
  qga-win: fix leaks of build_guest_disk_info()

Peter Maydell (1):
  qga: Add multiple include guard to guest-agent-core.h

 qga/commands-win32.c   | 5 -
 qga/guest-agent-core.h | 5 +
 2 files changed, 9 insertions(+), 1 deletion(-)





[Qemu-devel] [PULL for-3.1 1/2] qga-win: fix leaks of build_guest_disk_info()

2018-11-12 Thread Michael Roth
From: Marc-André Lureau 

Introduced in commit b1ba8890e63ce9432c41c5c3fc229f54c87c9c99, vol_h
handle should be closed, and "out" cleanup should be done after
DeviceIoControl() fails.

Signed-off-by: Marc-André Lureau 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Michael Roth 
---
 qga/commands-win32.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index ef1d7d48d2..62e1b51dfe 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -797,7 +797,7 @@ static GuestDiskAddressList *build_guest_disk_info(char 
*guid, Error **errp)
 0, extents, size, NULL, NULL)) {
 error_setg_win32(errp, GetLastError(),
 "failed to get disk extents");
-return NULL;
+goto out;
 }
 } else if (last_err == ERROR_INVALID_FUNCTION) {
 /* Possibly CD-ROM or a shared drive. Try to pass the volume */
@@ -855,6 +855,9 @@ static GuestDiskAddressList *build_guest_disk_info(char 
*guid, Error **errp)
 
 
 out:
+if (vol_h != INVALID_HANDLE_VALUE) {
+CloseHandle(vol_h);
+}
 qapi_free_GuestDiskAddress(disk);
 g_free(extents);
 g_free(name);
-- 
2.17.1




[Qemu-devel] [PULL for-3.1 2/2] qga: Add multiple include guard to guest-agent-core.h

2018-11-12 Thread Michael Roth
From: Peter Maydell 

The guest-agent-core.h header was missing the usual guards
against multiple inclusion; add them.

(Spotted by lgtm.com's static analyzer.)

Signed-off-by: Peter Maydell 
Reviewed-by: Marc-André Lureau 
Reviewed-by: Philippe Mathieu-Daudé 
Tested-by: Philippe Mathieu-Daudé 
Signed-off-by: Michael Roth 
---
 qga/guest-agent-core.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/qga/guest-agent-core.h b/qga/guest-agent-core.h
index 6f4d214cb9..60eae16f27 100644
--- a/qga/guest-agent-core.h
+++ b/qga/guest-agent-core.h
@@ -10,6 +10,9 @@
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  */
+#ifndef GUEST_AGENT_CORE_H
+#define GUEST_AGENT_CORE_H
+
 #include "qapi/qmp/dispatch.h"
 #include "qemu-common.h"
 #include "qga-qapi-types.h"
@@ -46,3 +49,5 @@ int ga_parse_whence(GuestFileWhence *whence, Error **errp);
 #ifndef _WIN32
 void reopen_fd_to_null(int fd);
 #endif
+
+#endif /* GUEST_AGENT_CORE_H */
-- 
2.17.1




Re: [Qemu-devel] [PATCH v2] Acceptance tests: add Linux initrd checking test

2018-11-12 Thread Eduardo Habkost
On Fri, Nov 09, 2018 at 01:21:53PM -0500, Wainer dos Santos Moschetta wrote:
> QEMU used to exits with a not accurate error message when
> an initrd > 2GiB was passed. That was fixed on patch:
> 
>   commit f3839fda5771596152b75dd1e1a6d050e6e6e380
>   Author: Li Zhijian 
>   Date:   Thu Sep 13 18:07:13 2018 +0800
> 
>   change get_image_size return type to int64_t
> 
> This change adds a regression test for that fix. It starts
> QEMU with a 2GiB dummy initrd, and check it evaluates the file
> size correctly and prints accurate message.
> 
> Signed-off-by: Wainer dos Santos Moschetta 
> Reviewed-by: Caio Carrara 
> Reviewed-by: Cleber Rosa 
> Reviewed-by: Eduardo Habkost 
> Reviewed-by: Philippe Mathieu-Daudé 
> Tested-by: Philippe Mathieu-Daudé 

Queued for 4.0, thanks!

-- 
Eduardo



Re: [Qemu-devel] [PATCH] nvme: fix oob access issue(CVE-2018-16847)

2018-11-12 Thread Li Qiang
Ping what't the status of this patch.

I see Kevin's new pr doesn't contain this patch.

Thanks,
Li Qiang

Li Qiang  于2018年11月2日周五 上午9:22写道:

> Currently, the nvme_cmb_ops mr doesn't check the addr and size.
> This can lead an oob access issue. This is triggerable in the guest.
> Add check to avoid this issue.
>
> Fixes CVE-2018-16847.
>
> Reported-by: Li Qiang 
> Reviewed-by: Paolo Bonzini 
> Signed-off-by: Li Qiang 
> ---
>  hw/block/nvme.c | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index fc7dacb..d097add 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1175,6 +1175,10 @@ static void nvme_cmb_write(void *opaque, hwaddr
> addr, uint64_t data,
>  unsigned size)
>  {
>  NvmeCtrl *n = (NvmeCtrl *)opaque;
> +
> +if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) {
> +return;
> +}
>  memcpy(&n->cmbuf[addr], &data, size);
>  }
>
> @@ -1183,6 +1187,9 @@ static uint64_t nvme_cmb_read(void *opaque, hwaddr
> addr, unsigned size)
>  uint64_t val;
>  NvmeCtrl *n = (NvmeCtrl *)opaque;
>
> +if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) {
> +return 0;
> +}
>  memcpy(&val, &n->cmbuf[addr], size);
>  return val;
>  }
> --
> 1.8.3.1
>
>


[Qemu-devel] [PATCH] memory: check write/read_with_attrs in memory dispatch

2018-11-12 Thread Li Qiang
This can avoid the NULL-deref if the rm doesn't has a
read/write nor write/read_with_attrs callback.

Signed-off-by: Li Qiang 
---
 memory.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/memory.c b/memory.c
index d14c6dec1d..3baf5857b9 100644
--- a/memory.c
+++ b/memory.c
@@ -1377,13 +1377,15 @@ static MemTxResult 
memory_region_dispatch_read1(MemoryRegion *mr,
  mr->ops->impl.max_access_size,
  memory_region_read_accessor,
  mr, attrs);
-} else {
+} else if (mr->ops->read_with_attrs) {
 return access_with_adjusted_size(addr, pval, size,
  mr->ops->impl.min_access_size,
  mr->ops->impl.max_access_size,
  
memory_region_read_with_attrs_accessor,
  mr, attrs);
 }
+
+return MEMTX_DECODE_ERROR;
 }
 
 MemTxResult memory_region_dispatch_read(MemoryRegion *mr,
@@ -1454,7 +1456,7 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr,
  mr->ops->impl.max_access_size,
  memory_region_write_accessor, mr,
  attrs);
-} else {
+} else if (mr->ops->write_with_attrs) {
 return
 access_with_adjusted_size(addr, &data, size,
   mr->ops->impl.min_access_size,
@@ -1462,6 +1464,8 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr,
   memory_region_write_with_attrs_accessor,
   mr, attrs);
 }
+
+return MEMTX_DECODE_ERROR;
 }
 
 void memory_region_init_io(MemoryRegion *mr,
-- 
2.11.0




Re: [Qemu-devel] [PATCH for-3.2 1/7] tests/pvpanic: Make the pvpanic test independent of global_qtest

2018-11-12 Thread Eric Blake

On 11/12/18 1:08 PM, Thomas Huth wrote:

We want to get rid of global_qtest in the long run, thus do not
use the wrappers like inb() and outb() here anymore.

Signed-off-by: Thomas Huth 
---
  tests/pvpanic-test.c | 14 --
  1 file changed, 8 insertions(+), 6 deletions(-)


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-devel] How to emulate block I/O timeout on qemu side?

2018-11-12 Thread Dongli Zhang



On 11/13/2018 06:52 AM, Marc Olson via Qemu-devel wrote:
> On 11/11/18 11:36 PM, Dongli Zhang wrote:
>> On 11/12/2018 03:13 PM, Marc Olson via Qemu-devel wrote:
>>> On 11/3/18 10:24 AM, Dongli Zhang wrote:
 The 'write' latency of sector=40960 is set to a very large value. When the 
 I/O
 is stalled in guest due to that sector=40960 is accessed, I do see below
 messages in guest log:

 [   80.807755] nvme nvme0: I/O 11 QID 2 timeout, aborting
 [   80.808095] nvme nvme0: Abort status: 0x4001


 However, then nothing happens further. nvme I/O hangs in guest. I am not
 able to
 kill the qemu process with Ctrl+C. Both vnc and qemu user net do not work. 
 I
 need to kill qemu with "kill -9"


 The same result for virtio-scsi and qemu is stuck as well.
>>> While I didn't try virtio-scsi, I wasn't able to reproduce this behavior 
>>> using
>>> nvme on Ubuntu 18.04 (4.15). What image and kernel version are you trying
>>> against?
>> Would you like to reproduce the "aborting" message or the qemu hang?
> I could not reproduce IO hanging in the guest, but I can reproduce qemu 
> hanging.
>> guest image: ubuntu 16.04
>> guest kernel: mainline linux kernel (and default kernel in ubuntu 16.04)
>> qemu: qemu-3.0.0 (with the blkdebug delay patch)
>>
>> Would you be able to see the nvme abort (which is indeed not supported by 
>> qemu)
>> message in guest kernel?
> Yes.
>> Once I see that message, I would not be able to kill the qemu-system-x86_64
>> command line with Ctrl+C.
> 
> I missed this part. I wasn't expecting to handle very long timeouts, but what
> appears to be happening is that the sleep doesn't get interrupted on 
> shutdown. I
> suspect something like this, on top of the series I sent last night, should 
> help:
> 
> diff --git a/block/blkdebug.c b/block/blkdebug.c
> index 6b1f2d6..0bfb91b 100644
> --- a/block/blkdebug.c
> +++ b/block/blkdebug.c
> @@ -557,8 +557,11 @@ static int rule_check(BlockDriverState *bs, uint64_t
> offset, uint64_t bytes)
>  remove_active_rule(s, delay_rule);
>  }
> 
> -if (latency != 0) {
> -qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, latency);
> +while (latency > 0 && 
> !aio_external_disabled(bdrv_get_aio_context(bs))) {
> +int64_t cur_latency = MIN(latency, 10ULL);
> +
> +qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, cur_latency);
> +latency -= cur_latency;
>  }
>  }
> 
> 
> /marc
> 
> 

I am able to interrupt qemu with above patch to periodically wake up and sleep
again.

Dongli Zhang



Re: [Qemu-devel] [QEMU PATCH v2 0/2]: KVM: i386: Add support for save and restore nested state

2018-11-12 Thread Liran Alon



> On 13 Nov 2018, at 2:07, Jim Mattson  wrote:
> 
> On Mon, Nov 12, 2018 at 4:00 PM, Liran Alon  wrote:
>> 
>> 
>>> On 12 Nov 2018, at 18:54, Daniel P. Berrangé  wrote:
>>> 
>>> On Mon, Nov 12, 2018 at 04:50:54PM +, Dr. David Alan Gilbert wrote:
 * Daniel P. Berrangé (berra...@redhat.com) wrote:
> On Sun, Nov 04, 2018 at 11:19:57PM +0100, Paolo Bonzini wrote:
>> On 02/11/2018 17:54, Daniel P. Berrangé wrote:
>>> We have usually followed a rule that new machine types must not
>>> affect runability of a VM on a host. IOW new machine types should
>>> not introduce dependancies on specific kernels, or hardware features
>>> such as CPU flags.
>> 
>>> Anything that requires a new kernel feature thus ought to be an
>>> opt-in config tunable on the CLI, separate from machine type
>>> choice.
>> 
>> Unless someone tinkered with the module parameters, they could not even
>> use nested virtualization before 4.20.  So for everyone else, "-cpu
>> ...,+vmx" does count as an "opt-in config tunable on the CLI" that
>> requires 4.20.
>> 
>> For those that did tinker with module parameters, we can grandfather in
>> the old machine types, so that they can use nested virtualization with
>> no live migration support.  For those that did not, however, I don't
>> think it makes sense to say "oh by the way I really want to be able to
>> migrate this VM" on the command line, or even worse on the monitor.
> 
> IIUC, 4.20 is only required from POV of migration state. Is it thus
> possible to just register a migration blocker if QEMU is launched
> on a host with kernel < 4.20.
> 
> Migration has always been busted historically, so those people using
> nested VMX already won't be hurt by not having ability to live migrate
> their VM, but could otherwise continue using them without being forced
> to upgrade their kernel to fix a feature they're not even using.
 
 Yes, although I am a bit worried we might have a population of users
 that:
  a) Have enabled nesting
  b) Run VMs with vmx enabled
>>> 
>>> 
  c) Don't normally actually run nested guests
  d) Currently happily migrate.
>>> 
>>> True, and (b) would include anyone using libvirt's  host-model CPU. So if
>>> you enabled nesting, have host-model for all guests, but only use nesting
>>> in one of the guests, you'd be doomed.
>>> 
>>> Is it possible for QEMU to determine if there are nested guests running or
>>> not and conditionally block migration appropriately to ensure safety ?
>> 
>> 
>> Only if kernel supports KVM_CAP_NESTED_STATE.
>> See my reply to Dave in this thread.
> 
> You could still allow migration if CR4.VMXE is clear.

Agreed. Nice addition :)

Thanks,
-Liran





Re: [Qemu-devel] [QEMU PATCH v2 0/2]: KVM: i386: Add support for save and restore nested state

2018-11-12 Thread Liran Alon



> On 12 Nov 2018, at 18:50, Dr. David Alan Gilbert  wrote:
> 
> * Daniel P. Berrangé (berra...@redhat.com) wrote:
>> On Sun, Nov 04, 2018 at 11:19:57PM +0100, Paolo Bonzini wrote:
>>> On 02/11/2018 17:54, Daniel P. Berrangé wrote:
 We have usually followed a rule that new machine types must not
 affect runability of a VM on a host. IOW new machine types should
 not introduce dependancies on specific kernels, or hardware features
 such as CPU flags.
>>> 
 Anything that requires a new kernel feature thus ought to be an
 opt-in config tunable on the CLI, separate from machine type
 choice.
>>> 
>>> Unless someone tinkered with the module parameters, they could not even
>>> use nested virtualization before 4.20.  So for everyone else, "-cpu
>>> ...,+vmx" does count as an "opt-in config tunable on the CLI" that
>>> requires 4.20.
>>> 
>>> For those that did tinker with module parameters, we can grandfather in
>>> the old machine types, so that they can use nested virtualization with
>>> no live migration support.  For those that did not, however, I don't
>>> think it makes sense to say "oh by the way I really want to be able to
>>> migrate this VM" on the command line, or even worse on the monitor.
>> 
>> IIUC, 4.20 is only required from POV of migration state. Is it thus
>> possible to just register a migration blocker if QEMU is launched
>> on a host with kernel < 4.20.
>> 
>> Migration has always been busted historically, so those people using
>> nested VMX already won't be hurt by not having ability to live migrate
>> their VM, but could otherwise continue using them without being forced
>> to upgrade their kernel to fix a feature they're not even using.
> 
> Yes, although I am a bit worried we might have a population of users
> that:
>   a) Have enabled nesting
>   b) Run VMs with vmx enabled
>   c) Don't normally actually run nested guests
>   d) Currently happily migrate.
> 
> Dave

First of all, we should put the entire kvm_nested_state in a VMState subsection 
that has a .needed() method
that checks if ((format==VMX) && (vmx.vmxon_pa != -1ull));
This will allow migrating a VM that has nested exposed but didn’t really enter 
VMX operation to still be able
to successfully migrate to old hosts without any issues.

The only problem remaining to be solved that you discuss here is what happens 
if user runs with modern QEMU
(Including my to-be-written v3 patches discussed here) on a host that has 
kernel without KVM_CAP_NESTED_STATE.
In this kernel, it is impossible for userspace (e.g. QEMU) to know if guest 
that was exposed with VMX actually used it.
So for those cases, I see no option but to modify QEMU to also say that if 
guest is exposed with VMX and
we are running on kernel without KVM_CAP_NESTED_STATE, then this is a migration 
blocker.

-Liran

> 
>> Regards,
>> Daniel
>> -- 
>> |: 
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__berrange.com=DwIDAw=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=ximE6x4FrQ5vo3D4wF3w-cVsKgtNs85wCTL5GiP_B5Q=rs9ZkUUS37SHrs_oZJ9uIiXtpXvUkJBpfVEe9OSiQzk=
>>   -o-
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.flickr.com_photos_dberrange=DwIDAw=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=ximE6x4FrQ5vo3D4wF3w-cVsKgtNs85wCTL5GiP_B5Q=u_hnAFyY8BVwto0FsomTcZ3dmLKPYb1hwI_jRXI6EZg=
>>  :|
>> |: 
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__libvirt.org=DwIDAw=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=ximE6x4FrQ5vo3D4wF3w-cVsKgtNs85wCTL5GiP_B5Q=SEIw983ixpJUUySxUwrxtIbnjvHTB9ff3MaqULaulQw=
>>  -o-
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__fstop138.berrange.com=DwIDAw=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=ximE6x4FrQ5vo3D4wF3w-cVsKgtNs85wCTL5GiP_B5Q=yvxjeOrwjXjf08RBhPdX53lJN1W-8WSXT25ZeMSA06k=
>>  :|
>> |: 
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__entangle-2Dphoto.org=DwIDAw=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=ximE6x4FrQ5vo3D4wF3w-cVsKgtNs85wCTL5GiP_B5Q=9TIjtmf6AVFYWbyzI5vl-zXTaNCSCMAxyck92pc8yvY=
>> -o-
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.instagram.com_dberrange=DwIDAw=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=ximE6x4FrQ5vo3D4wF3w-cVsKgtNs85wCTL5GiP_B5Q=Wapdtm0yT4j-9EjMoxwo9QvRZ3h9Fk_CvpQq8J4TDpg=
>>  :|
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [Qemu-devel] [QEMU PATCH v2 0/2]: KVM: i386: Add support for save and restore nested state

2018-11-12 Thread Liran Alon



> On 12 Nov 2018, at 18:54, Daniel P. Berrangé  wrote:
> 
> On Mon, Nov 12, 2018 at 04:50:54PM +, Dr. David Alan Gilbert wrote:
>> * Daniel P. Berrangé (berra...@redhat.com) wrote:
>>> On Sun, Nov 04, 2018 at 11:19:57PM +0100, Paolo Bonzini wrote:
 On 02/11/2018 17:54, Daniel P. Berrangé wrote:
> We have usually followed a rule that new machine types must not
> affect runability of a VM on a host. IOW new machine types should
> not introduce dependancies on specific kernels, or hardware features
> such as CPU flags.
 
> Anything that requires a new kernel feature thus ought to be an
> opt-in config tunable on the CLI, separate from machine type
> choice.
 
 Unless someone tinkered with the module parameters, they could not even
 use nested virtualization before 4.20.  So for everyone else, "-cpu
 ...,+vmx" does count as an "opt-in config tunable on the CLI" that
 requires 4.20.
 
 For those that did tinker with module parameters, we can grandfather in
 the old machine types, so that they can use nested virtualization with
 no live migration support.  For those that did not, however, I don't
 think it makes sense to say "oh by the way I really want to be able to
 migrate this VM" on the command line, or even worse on the monitor.
>>> 
>>> IIUC, 4.20 is only required from POV of migration state. Is it thus
>>> possible to just register a migration blocker if QEMU is launched
>>> on a host with kernel < 4.20.
>>> 
>>> Migration has always been busted historically, so those people using
>>> nested VMX already won't be hurt by not having ability to live migrate
>>> their VM, but could otherwise continue using them without being forced
>>> to upgrade their kernel to fix a feature they're not even using.
>> 
>> Yes, although I am a bit worried we might have a population of users
>> that:
>>   a) Have enabled nesting
>>   b) Run VMs with vmx enabled
> 
> 
>>   c) Don't normally actually run nested guests
>>   d) Currently happily migrate.
> 
> True, and (b) would include anyone using libvirt's  host-model CPU. So if
> you enabled nesting, have host-model for all guests, but only use nesting
> in one of the guests, you'd be doomed.
> 
> Is it possible for QEMU to determine if there are nested guests running or
> not and conditionally block migration appropriately to ensure safety ?


Only if kernel supports KVM_CAP_NESTED_STATE.
See my reply to Dave in this thread.

-Liran

> 
> 
> Regards,
> Daniel
> -- 
> |: 
> https://urldefense.proofpoint.com/v2/url?u=https-3A__berrange.com=DwIDaQ=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=eMOrT-7t7-tfRtTw2da9c1YTU0_tOFfkVIhj9mWv-Pc=DIzWfmRGWO1b6hzL9NRbIt41fiFcnPt0MC8917u4Qv0=
>   -o-
> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.flickr.com_photos_dberrange=DwIDaQ=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=eMOrT-7t7-tfRtTw2da9c1YTU0_tOFfkVIhj9mWv-Pc=CjA-joyt2Y9t5B4YzIiupfY8EEO58m4vbmnd45adzFI=
>  :|
> |: 
> https://urldefense.proofpoint.com/v2/url?u=https-3A__libvirt.org=DwIDaQ=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=eMOrT-7t7-tfRtTw2da9c1YTU0_tOFfkVIhj9mWv-Pc=tD05tikOHMJhh_EeZ2Esoxb0oku3MPFmj-S2YHdUGm0=
>  -o-
> https://urldefense.proofpoint.com/v2/url?u=https-3A__fstop138.berrange.com=DwIDaQ=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=eMOrT-7t7-tfRtTw2da9c1YTU0_tOFfkVIhj9mWv-Pc=YAh1WAoXQKEB6hkMmG6ZnJQETOFnq6eqQLmJokME80A=
>  :|
> |: 
> https://urldefense.proofpoint.com/v2/url?u=https-3A__entangle-2Dphoto.org=DwIDaQ=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=eMOrT-7t7-tfRtTw2da9c1YTU0_tOFfkVIhj9mWv-Pc=90Mm1Qb-SHe8P63xwGp6gzMU1I5DEW6YX0ttG6TL_7g=
> -o-
> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.instagram.com_dberrange=DwIDaQ=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE=Jk6Q8nNzkQ6LJ6g42qARkg6ryIDGQr-yKXPNGZbpTx0=eMOrT-7t7-tfRtTw2da9c1YTU0_tOFfkVIhj9mWv-Pc=l4NrrDdRzPClvYQdxQdfIW0geHPWcukeyOGX8QapwYA=
>  :|




Re: [Qemu-devel] [QEMU PATCH v2 0/2]: KVM: i386: Add support for save and restore nested state

2018-11-12 Thread Jim Mattson via Qemu-devel
On Mon, Nov 12, 2018 at 4:00 PM, Liran Alon  wrote:
>
>
>> On 12 Nov 2018, at 18:54, Daniel P. Berrangé  wrote:
>>
>> On Mon, Nov 12, 2018 at 04:50:54PM +, Dr. David Alan Gilbert wrote:
>>> * Daniel P. Berrangé (berra...@redhat.com) wrote:
 On Sun, Nov 04, 2018 at 11:19:57PM +0100, Paolo Bonzini wrote:
> On 02/11/2018 17:54, Daniel P. Berrangé wrote:
>> We have usually followed a rule that new machine types must not
>> affect runability of a VM on a host. IOW new machine types should
>> not introduce dependancies on specific kernels, or hardware features
>> such as CPU flags.
>
>> Anything that requires a new kernel feature thus ought to be an
>> opt-in config tunable on the CLI, separate from machine type
>> choice.
>
> Unless someone tinkered with the module parameters, they could not even
> use nested virtualization before 4.20.  So for everyone else, "-cpu
> ...,+vmx" does count as an "opt-in config tunable on the CLI" that
> requires 4.20.
>
> For those that did tinker with module parameters, we can grandfather in
> the old machine types, so that they can use nested virtualization with
> no live migration support.  For those that did not, however, I don't
> think it makes sense to say "oh by the way I really want to be able to
> migrate this VM" on the command line, or even worse on the monitor.

 IIUC, 4.20 is only required from POV of migration state. Is it thus
 possible to just register a migration blocker if QEMU is launched
 on a host with kernel < 4.20.

 Migration has always been busted historically, so those people using
 nested VMX already won't be hurt by not having ability to live migrate
 their VM, but could otherwise continue using them without being forced
 to upgrade their kernel to fix a feature they're not even using.
>>>
>>> Yes, although I am a bit worried we might have a population of users
>>> that:
>>>   a) Have enabled nesting
>>>   b) Run VMs with vmx enabled
>>
>>
>>>   c) Don't normally actually run nested guests
>>>   d) Currently happily migrate.
>>
>> True, and (b) would include anyone using libvirt's  host-model CPU. So if
>> you enabled nesting, have host-model for all guests, but only use nesting
>> in one of the guests, you'd be doomed.
>>
>> Is it possible for QEMU to determine if there are nested guests running or
>> not and conditionally block migration appropriately to ensure safety ?
>
>
> Only if kernel supports KVM_CAP_NESTED_STATE.
> See my reply to Dave in this thread.

You could still allow migration if CR4.VMXE is clear.



Re: [Qemu-devel] [PATCH 0/2] linux-user/mips: Support the n32 ABI for the R5900

2018-11-12 Thread Maciej W. Rozycki
On Fri, 9 Nov 2018, Maciej W. Rozycki wrote:

> > Some readelf results:
> > 
> > mips64el/stretch
> > 
> >   Magic:   7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00
> >   Class:   ELF64
> >   Flags:   0x8007, noreorder, pic, cpic, mips64r2
> 
>  Hmm, that's weird -- what executable did you check?  There may be some 
> that are n64, or maybe they've switched (which I would applaud, FWIW).  I 
> remember seeing mostly n32, with minimal support for n64, but that was a 
> while ago -- jessie or suchlike, I believe.  Using MIPS64r2 as the base 
> ISA also looks new to me, that used to be plain MIPS III, and some of 
> Debian's MIPS build systems used to be either MIPS III (Lemote Loongson) 
> or MIPS64r1 (Broadcom SiByte).

 OK, I definitely got this confused.  I did some checking and jessie 
didn't even have a 64-bit MIPS port.  I got their build systems right 
though, and the kernel is 64-bit for systems that support it.

> > Any binaries that need qemu-mipsn32 or qemu-mipsn32el?
> 
>  I'd expect at least the n32 dynamic loader (along with libc and some 
> other essential DSOs) to be present with MIPS64 Debian.  Traditionally, 
> under the FHS rules, it would be installed as /lib32/ld.so.1 (with the o32 
> one as /lib/ld.so.1 and the n64 as /lib64/ld.so.1), but Debian uses their 
> own multiarch filesystem layout standard, and offhand I don't remember 
> what the paths are defined to there.

 So with jessie you can install the `libc6-dev-mipsn32' package, which 
will get you n32 glibc development libraries and will pull the 
complementing n32 dynamic loader (at /lib32/ld.so.1 actually) and n32 
glibc shared libraries as well.

 Unfortunately multilib support files, such as the CRT files, seem to be 
missing from GCC for n32 or I cannot find them.  Otherwise you would be 
able to compile and link n32 binaries just by calling `gcc -mabi=n32'.  
Still the dynamic loader is directly runnable, as I noted above.

 HTH,

  Maciej



Re: [Qemu-devel] [RFC PATCH 02/11] decodetree: Add multiple include guard

2018-11-12 Thread Philippe Mathieu-Daudé

On 12/11/18 23:30, Eduardo Habkost wrote:

On Mon, Nov 12, 2018 at 12:36:13AM +0100, Philippe Mathieu-Daudé wrote:

It is necessary when splitting an ISA, or when using multiple ISAs.

Signed-off-by: Philippe Mathieu-Daudé 
---
TODO: explain why, use case
TODO: escape full path?
---
  scripts/decodetree.py | 5 +
  1 file changed, 5 insertions(+)

diff --git a/scripts/decodetree.py b/scripts/decodetree.py
index 0bc73b5990..5dea15e7a5 100755
--- a/scripts/decodetree.py
+++ b/scripts/decodetree.py
@@ -1030,7 +1030,11 @@ def main():
  else:
  output_fd = sys.stdout
  
+hdr_guard = filename.split(os.path.sep)[-1].upper().replace('.', '_') + "_H"


Isn't
   filename.split(os.path.sep)[-1]
equivalent to
   os.path.basename(filename)
?


Yes, thanks :)



Re: [Qemu-devel] [PATCH v2 07/11] block: Leave BDS.backing_file constant

2018-11-12 Thread Eric Blake

On 8/9/18 5:31 PM, Max Reitz wrote:

Parts of the block layer treat BDS.backing_file as if it were whatever
the image header says (i.e., if it is a relative path, it is relative to
the overlay), other parts treat it like a cache for
bs->backing->bs->filename (relative paths are relative to the CWD).
Considering bs->backing->bs->filename exists, let us make it mean the
former.

Among other things, this now allows the user to specify a base when
using qemu-img to commit an image file in a directory that is not the
CWD (assuming, everything uses relative filenames).

Before this patch:

$ ./qemu-img create -f qcow2 foo/bot.qcow2 1M
$ ./qemu-img create -f qcow2 -b bot.qcow2 foo/mid.qcow2
$ ./qemu-img create -f qcow2 -b mid.qcow2 foo/top.qcow2
$ ./qemu-img commit -b mid.qcow2 foo/top.qcow2
qemu-img: Did not find 'mid.qcow2' in the backing chain of 'foo/top.qcow2'
$ ./qemu-img commit -b foo/mid.qcow2 foo/top.qcow2
qemu-img: Did not find 'foo/mid.qcow2' in the backing chain of 'foo/top.qcow2'
$ ./qemu-img commit -b $PWD/foo/mid.qcow2 foo/top.qcow2
qemu-img: Did not find '[...]/foo/mid.qcow2' in the backing chain of 
'foo/top.qcow2'


Three failures in a row - no way to commit short of changing your 
working directory.




After this patch:

$ ./qemu-img commit -b mid.qcow2 foo/top.qcow2
Image committed.
$ ./qemu-img commit -b foo/mid.qcow2 foo/top.qcow2
qemu-img: Did not find 'foo/mid.qcow2' in the backing chain of 'foo/top.qcow2'
$ ./qemu-img commit -b $PWD/foo/mid.qcow2 foo/top.qcow2
Image committed.


Yay, that looks saner.



With this change, bdrv_find_backing_image() must look at whether the
user has overridden a BDS's backing file.  If so, it can no longer use
bs->backing_file, but must instead compare the given filename against
the backing node's filename directly.

Note that this changes the QAPI output for a node's backing_file.  We
had very inconsistent output there (sometimes what the image header
said, sometimes the actual filename of the backing image).  This
inconsistent output was effectively useless, so we have to decide one
way or the other.  Considering that bs->backing_file usually at runtime
contained the path to the image relative to qemu's CWD (or absolute),
this patch changes QAPI's backing_file to always report the
bs->backing->bs->filename from now on.  If you want to receive the image
header information, you have to refer to full-backing-filename.

This necessitates a change to iotest 228.  The interesting information
it really wanted is the image header, and it can get that now, but it
has to use full-backing-filename instead of backing_file.  Because of
this patch's changes to bs->backing_file's behavior, we also need some
reference output changes.

Along with the changes to bs->backing_file, stop updating
BDS.backing_format in bdrv_backing_attach() as well.  This necessitates
a change to the reference output of iotest 191.


Good explanations for the test changes.



Signed-off-by: Max Reitz 
---
  include/block/block_int.h  | 14 +-
  block.c| 29 ++---
  block/qapi.c   |  7 ---
  qemu-img.c | 12 ++--
  tests/qemu-iotests/191.out |  1 -
  tests/qemu-iotests/228 |  6 +++---
  tests/qemu-iotests/228.out |  6 +++---
  7 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index d3d8b22155..8f2c515ec1 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -737,11 +737,15 @@ struct BlockDriverState {
  bool walking_aio_notifiers; /* to make removal during iteration safe */
  
  char filename[PATH_MAX];

-char backing_file[PATH_MAX]; /* if non zero, the image is a diff of
-this file image */
-/* The backing filename indicated by the image header; if we ever
- * open this file, then this is replaced by the resulting BDS's
- * filename (i.e. after a bdrv_refresh_filename() run). */
+/* If non-zero, the image is a diff of this image file.  Note that


Pre-existing, but that sentence might read nicer as:

If not empty, this image is a diff in relation to backing_file.


+ * this the name given in the image header and may therefore not


"this the name" is wrong; did you mean "this is the name" or "this name" 
or "the name"?



+ * be equal to .backing->bs->filename, and relative paths are
+ * resolved relatively to their overlay. */
+char backing_file[PATH_MAX];
+/* The backing filename indicated by the image header.  Contrary
+ * to backing_file, if we ever open this file, auto_backing_file
+ * is replaced by the resulting BDS's filename (i.e. after a
+ * bdrv_refresh_filename() run). */
  char auto_backing_file[PATH_MAX];
  char backing_format[16]; /* if non-zero and backing_file exists */
  


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266

Re: [Qemu-devel] [PATCH v2 06/11] iotests: Add tests for mirror @replaces loops

2018-11-12 Thread Eric Blake

On 8/9/18 5:31 PM, Max Reitz wrote:

This adds two tests for cases where our old check_to_replace_node()
function failed to detect that executing this job with these parameters
would result in a cyclic graph.

Signed-off-by: Max Reitz 
---
  tests/qemu-iotests/041 | 124 +
  tests/qemu-iotests/041.out |   4 +-
  2 files changed, 126 insertions(+), 2 deletions(-)



With your followup amendments to allow qed testing (hmm, you mention 
you'd be posting a v3),


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-devel] How to emulate block I/O timeout on qemu side?

2018-11-12 Thread Marc Olson via Qemu-devel

On 11/11/18 11:36 PM, Dongli Zhang wrote:

On 11/12/2018 03:13 PM, Marc Olson via Qemu-devel wrote:

On 11/3/18 10:24 AM, Dongli Zhang wrote:

The 'write' latency of sector=40960 is set to a very large value. When the I/O
is stalled in guest due to that sector=40960 is accessed, I do see below
messages in guest log:

[   80.807755] nvme nvme0: I/O 11 QID 2 timeout, aborting
[   80.808095] nvme nvme0: Abort status: 0x4001


However, then nothing happens further. nvme I/O hangs in guest. I am not able to
kill the qemu process with Ctrl+C. Both vnc and qemu user net do not work. I
need to kill qemu with "kill -9"


The same result for virtio-scsi and qemu is stuck as well.

While I didn't try virtio-scsi, I wasn't able to reproduce this behavior using
nvme on Ubuntu 18.04 (4.15). What image and kernel version are you trying 
against?

Would you like to reproduce the "aborting" message or the qemu hang?
I could not reproduce IO hanging in the guest, but I can reproduce qemu 
hanging.

guest image: ubuntu 16.04
guest kernel: mainline linux kernel (and default kernel in ubuntu 16.04)
qemu: qemu-3.0.0 (with the blkdebug delay patch)

Would you be able to see the nvme abort (which is indeed not supported by qemu)
message in guest kernel?

Yes.

Once I see that message, I would not be able to kill the qemu-system-x86_64
command line with Ctrl+C.


I missed this part. I wasn't expecting to handle very long timeouts, but 
what appears to be happening is that the sleep doesn't get interrupted 
on shutdown. I suspect something like this, on top of the series I sent 
last night, should help:


diff --git a/block/blkdebug.c b/block/blkdebug.c
index 6b1f2d6..0bfb91b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -557,8 +557,11 @@ static int rule_check(BlockDriverState *bs, 
uint64_t offset, uint64_t bytes)

 remove_active_rule(s, delay_rule);
 }

-    if (latency != 0) {
-    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, latency);
+    while (latency > 0 && 
!aio_external_disabled(bdrv_get_aio_context(bs))) {

+    int64_t cur_latency = MIN(latency, 10ULL);
+
+    qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, cur_latency);
+    latency -= cur_latency;
 }
 }


/marc




Re: [Qemu-devel] [for 3.1? Qemu-devel] [PATCH v2 05/11] block: Fix check_to_replace_node()

2018-11-12 Thread Eric Blake

On 8/9/18 5:31 PM, Max Reitz wrote:

Currently, check_to_replace_node() only allows mirror to replace a node
in the chain of the source node, and only if it is the first non-filter
node below the source.  Well, technically, the idea is that you can
exactly replace a quorum child by mirroring from quorum.

This has (probably) two reasons:
(1) We do not want to create loops.
(2) @replaces and @device should have exactly the same content so
 replacing them does not cause visible data to change.

This has two issues:
(1) It is overly restrictive.  It is completely fine for @replaces to be
 a filter.
(2) It is not restrictive enough.  You can create loops with this as
 follows:

$ qemu-img create -f qcow2 /tmp/source.qcow2 64M
$ qemu-system-x86_64 -qmp stdio
{"execute": "qmp_capabilities"}
{"execute": "object-add",
  "arguments": {"qom-type": "throttle-group", "id": "tg0"}}
{"execute": "blockdev-add",
  "arguments": {
  "node-name": "source",
  "driver": "throttle",
  "throttle-group": "tg0",
  "file": {
  "node-name": "filtered",
  "driver": "qcow2",
  "file": {
  "driver": "file",
  "filename": "/tmp/source.qcow2"
  } } } }
{"execute": "drive-mirror",
  "arguments": {
  "job-id": "mirror",
  "device": "source",
  "target": "/tmp/target.qcow2",
  "format": "qcow2",
  "node-name": "target",
  "sync" :"none",
  "replaces": "filtered"
  } }
{"execute": "block-job-complete", "arguments": {"device": "mirror"}}

And qemu crashes because of a stack overflow due to the loop being
created (target's backing file is source, so when it replaces filtered,
it points to itself through source).


Sounds like good material for inclusion in 3.1.



(blockdev-mirror can be broken similarly.)

So let us make the checks for the two conditions above explicit, which
makes the whole function exactly as restrictive as it needs to be.

Signed-off-by: Max Reitz 
---


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-devel] [PATCH v2 04/11] block: Storage child access function

2018-11-12 Thread Eric Blake

On 8/9/18 5:31 PM, Max Reitz wrote:

For completeness' sake, add a function for accessing a node's storage
child, too.  For filters, this is there filtered child; for non-filters,


s/there/their/


this is bs->file.

Some places are deliberately left unconverted:
- BDS opening/closing functions where bs->file is handled specially
   (which is basically wrong, but at least simplifies probing)
- bdrv_co_block_status_from_file(), because its name implies that it
   points to ->file


I'm wondering if we can clean up block_status to let filters have a NULL 
callback and io.c do the right thing automatically, rather than the 
current approach of filters assigning the callback to the common helper 
routine.  Maybe later in the series.



- bdrv_snapshot_goto() in one places unrefs bs->file.  Such a


s/places/place/


   modification is not covered by this patch and is therefore just
   safeguarded by an additional assert(), but otherwise kept as-is.

Signed-off-by: Max Reitz 
---



+++ b/block/snapshot.c



@@ -204,37 +207,38 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
  return ret;
  }
  
-if (bs->file) {

-BlockDriverState *file;
+storage_bs = bdrv_storage_bs(bs);
+if (storage_bs) {
  QDict *options = qdict_clone_shallow(bs->options);
  QDict *file_options;
  Error *local_err = NULL;
  
-file = bs->file->bs;

  /* Prevent it from getting deleted when detached from bs */
-bdrv_ref(file);
+bdrv_ref(storage_bs);
  
  qdict_extract_subqdict(options, _options, "file.");

  qobject_unref(file_options);
-qdict_put_str(options, "file", bdrv_get_node_name(file));
+qdict_put_str(options, "file", bdrv_get_node_name(storage_bs));
  
  drv->bdrv_close(bs);

+
+assert(bs->file->bs == storage_bs);


At first glance, this assertion...


  bdrv_unref_child(bs, bs->file);
  bs->file = NULL;
  
-ret = bdrv_snapshot_goto(file, snapshot_id, errp);

+ret = bdrv_snapshot_goto(storage_bs, snapshot_id, errp);
  open_ret = drv->bdrv_open(bs, options, bs->open_flags, _err);
  qobject_unref(options);
  if (open_ret < 0) {
-bdrv_unref(file);
+bdrv_unref(storage_bs);
  bs->drv = NULL;
  /* A bdrv_snapshot_goto() error takes precedence */
  error_propagate(errp, local_err);
  return ret < 0 ? ret : open_ret;
  }
  
-assert(bs->file->bs == file);

-bdrv_unref(file);
+assert(bs->file->bs == storage_bs);


...looks like a duplicate of this one. But looking closer, I see 
bs->file = NULL followed by drv->bdrv_open() in between which should 
reassign bs->file, so having the assertion on both ends makes sense.


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-devel] [RFC PATCH 02/11] decodetree: Add multiple include guard

2018-11-12 Thread Eduardo Habkost
On Mon, Nov 12, 2018 at 12:36:13AM +0100, Philippe Mathieu-Daudé wrote:
> It is necessary when splitting an ISA, or when using multiple ISAs.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> TODO: explain why, use case
> TODO: escape full path?
> ---
>  scripts/decodetree.py | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/scripts/decodetree.py b/scripts/decodetree.py
> index 0bc73b5990..5dea15e7a5 100755
> --- a/scripts/decodetree.py
> +++ b/scripts/decodetree.py
> @@ -1030,7 +1030,11 @@ def main():
>  else:
>  output_fd = sys.stdout
>  
> +hdr_guard = filename.split(os.path.sep)[-1].upper().replace('.', '_') + 
> "_H"

Isn't
  filename.split(os.path.sep)[-1]
equivalent to
  os.path.basename(filename)
?


> +hdr_guard = filename.split(os.path.sep)[-1].upper().replace('.', '_') + 
> "_H"
> +
>  output_autogen()
> +output('#ifndef ' + hdr_guard + '\n')
> +output('#define ' + hdr_guard + '\n')
>  for n in sorted(arguments.keys()):
>  f = arguments[n]
>  f.output_def()
> @@ -1066,6 +1070,7 @@ def main():
>  t.output_code(4, False, 0, 0)
>  
>  output('}\n')
> +output('#endif /* ' + hdr_guard + ' */\n')
>  
>  if output_file:
>  output_fd.close()
> -- 
> 2.17.2
> 

-- 
Eduardo



Re: [Qemu-devel] [PATCH v2 03/11] block: Filtered children access functions

2018-11-12 Thread Eric Blake

On 8/9/18 5:31 PM, Max Reitz wrote:

What bs->file and bs->backing mean depends on the node.  For filter
nodes, both signify a node that will eventually receive all R/W
accesses.  For format nodes, bs->file contains metadata and data, and
bs->backing will not receive writes -- instead, writes are COWed to
bs->file.  Usually.

In any case, it is not trivial to guess what a child means exactly with
our currently limited form of expression.  It is better to introduce
some functions that actually guarantee a meaning:

- bdrv_filtered_cow_child() will return the child that receives requests
   filtered through COW.  That is, reads may or may not be forwarded
   (depending on the overlay's allocation status), but writes never go to
   this child.

- bdrv_filtered_rw_child() will return the child that receives requests
   filtered through some very plain process.  Reads and writes issued to
   the parent will go to the child as well (although timing, etc. may be
   modified).

- All drivers but quorum (but quorum is pretty opaque to the general
   block layer anyway) always only have one of these children: All read
   requests must be served from the filtered_rw_child (if it exists), so
   if there was a filtered_cow_child in addition, it would not receive
   any requests at all.
   (The closest here is mirror, where all requests are passed on to the
   source, but with write-blocking, write requests are "COWed" to the
   target.  But that just means that the target is a special child that
   cannot be introspected by the generic block layer functions, and that
   source is a filtered_rw_child.)
   Therefore, we can also add bdrv_filtered_child() which returns that
   one child (or NULL, if there is no filtered child).

Also, many places in the current block layer should be skipping filters
(all filters or just the ones added implicitly, it depends) when going
through a block node chain.  They do not do that currently, but this
patch makes them.


The description makes sense; now on to the code.



Signed-off-by: Max Reitz 
---
  qapi/block-core.json   |   4 +
  include/block/block.h  |   1 +
  include/block/block_int.h  |  33 +-
  block.c| 184 -
  block/backup.c |   8 +-
  block/block-backend.c  |  16 ++-
  block/commit.c |  36 ---
  block/io.c |  27 ++---
  block/mirror.c |  37 ---
  block/qapi.c   |  26 ++---
  block/stream.c |  15 ++-
  blockdev.c |  84 ---
  migration/block-dirty-bitmap.c |   4 +-
  nbd/server.c   |   8 +-
  qemu-img.c |  12 ++-
  15 files changed, 363 insertions(+), 132 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index f20efc97f7..a71df88eb2 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2248,6 +2248,10 @@
  # On successful completion the image file is updated to drop the backing file
  # and the BLOCK_JOB_COMPLETED event is emitted.


Context: this is part of block-stream.


  #
+# In case @device is a filter node, block-stream modifies the first non-filter
+# overlay node below it to point to base's backing node (or NULL if @base was
+# not specified) instead of modifying @device itself.


That is, if we have:

base <- filter1 <- active <- filter2

and request a block-stream with "top":"filter2", it is no different in 
effect than if we had requested "top":"active", since filter nodes can't 
be stream targets.  Makes sense.


What happens if we request "base":"filter1"? Do we want to require base 
to be a non-filter node?



+++ b/include/block/block_int.h
@@ -91,6 +91,7 @@ struct BlockDriver {
   * certain callbacks that refer to data (see block.c) to their bs->file if
   * the driver doesn't implement them. Drivers that do not wish to forward
   * must implement them and return -ENOTSUP.
+ * Note that filters are not allowed to modify data.


They can modify offsets and timing, but not data?  Even if it is an 
encryption filter?  I'm trying to figure out if LUKS behaves like a filter.



+++ b/block.c
@@ -532,11 +532,12 @@ int bdrv_create_file(const char *filename, QemuOpts 
*opts, Error **errp)
  int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
  {
  BlockDriver *drv = bs->drv;
+BlockDriverState *filtered = bdrv_filtered_rw_bs(bs);


Is it worth a micro-optimization of not calling this...

  
  if (drv && drv->bdrv_probe_blocksizes) {

  return drv->bdrv_probe_blocksizes(bs, bsz);


...until after checking drv->bdrv_probe_blocksizes?


-} else if (drv && drv->is_filter && bs->file) {
-return bdrv_probe_blocksizes(bs->file->bs, bsz);
+} else if (filtered) {
+return bdrv_probe_blocksizes(filtered, bsz);
  }


But I don't mind if you leave it as written.

Is blkdebug a filter, or something else?  That's a 

Re: [Qemu-devel] [PULL 40/55] target/arm: Enable SVE for aarch64-linux-user

2018-11-12 Thread Alex Bennée


Laurent Vivier  writes:

> On 29/06/2018 16:53, Peter Maydell wrote:
>> From: Richard Henderson 
>>
>> Enable ARM_FEATURE_SVE for the generic "max" cpu.
>>
>> Tested-by: Alex Bennée 
>> Reviewed-by: Peter Maydell 
>> Signed-off-by: Richard Henderson 
>> Message-id: 20180627043328.11531-35-richard.hender...@linaro.org
>> Signed-off-by: Peter Maydell 
>> ---
>>  linux-user/elfload.c | 1 +
>>  target/arm/cpu.c | 7 +++
>>  target/arm/cpu64.c   | 1 +
>>  3 files changed, 9 insertions(+)
>>
>> diff --git a/linux-user/elfload.c b/linux-user/elfload.c
>> index 13bc78d0c86..d1231ad07a3 100644
>> --- a/linux-user/elfload.c
>> +++ b/linux-user/elfload.c
>> @@ -584,6 +584,7 @@ static uint32_t get_elf_hwcap(void)
>>  GET_FEATURE(ARM_FEATURE_V8_ATOMICS, ARM_HWCAP_A64_ATOMICS);
>>  GET_FEATURE(ARM_FEATURE_V8_RDM, ARM_HWCAP_A64_ASIMDRDM);
>>  GET_FEATURE(ARM_FEATURE_V8_FCMA, ARM_HWCAP_A64_FCMA);
>> +GET_FEATURE(ARM_FEATURE_SVE, ARM_HWCAP_A64_SVE);
>>  #undef GET_FEATURE
>>
>>  return hwcaps;
>> diff --git a/target/arm/cpu.c b/target/arm/cpu.c
>> index 2ae4fffafb9..6dcc552e143 100644
>> --- a/target/arm/cpu.c
>> +++ b/target/arm/cpu.c
>> @@ -164,6 +164,13 @@ static void arm_cpu_reset(CPUState *s)
>>  env->cp15.sctlr_el[1] |= SCTLR_UCT | SCTLR_UCI | SCTLR_DZE;
>>  /* and to the FP/Neon instructions */
>>  env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 20, 2, 3);
>> +/* and to the SVE instructions */
>> +env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 16, 2, 3);
>> +env->cp15.cptr_el[3] |= CPTR_EZ;
>> +/* with maximum vector length */
>> +env->vfp.zcr_el[1] = ARM_MAX_VQ - 1;
>> +env->vfp.zcr_el[2] = ARM_MAX_VQ - 1;
>> +env->vfp.zcr_el[3] = ARM_MAX_VQ - 1;
>>  #else
>>  /* Reset into the highest available EL */
>>  if (arm_feature(env, ARM_FEATURE_EL3)) {
>> diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
>> index c50dcd4077d..0360d7efc5e 100644
>> --- a/target/arm/cpu64.c
>> +++ b/target/arm/cpu64.c
>> @@ -252,6 +252,7 @@ static void aarch64_max_initfn(Object *obj)
>>  set_feature(>env, ARM_FEATURE_V8_RDM);
>>  set_feature(>env, ARM_FEATURE_V8_FP16);
>>  set_feature(>env, ARM_FEATURE_V8_FCMA);
>> +set_feature(>env, ARM_FEATURE_SVE);
>>  /* For usermode -cpu max we can use a larger and more efficient DCZ
>>   * blocksize since we don't have to follow what the hardware does.
>>   */
>>
>
> Running some tests for my pull request, I've found this commit breaks
> ltp-full-20180515 sigaltstack01 tests with ubuntu arm64/trusty.
>
> sigaltstack01  274  TBROK  :  tst_sig.c:233: unexpected signal
> SIGIOT/SIGABRT(6) received (pid = 15241).
> *** Error in `/opt/ltp/testcases/bin/sigaltstack01': free(): invalid
> pointer: 0x0042a010 ***

I wonder if that is the test case not handling the full frame size (or
us not checking the allocated size). What syscall or signal delivery was
happening at the time?

>
> Thanks,
> Laurent


--
Alex Bennée



[Qemu-devel] [PATCH for-4.0 07/17] tcg/i386: Use TCG_TARGET_NEED_LDST_OOL_LABELS

2018-11-12 Thread Richard Henderson
Move the entire memory operation out of line.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.h |   2 +-
 tcg/i386/tcg-target.inc.c | 401 --
 2 files changed, 171 insertions(+), 232 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 9fdf37f23c..c2d84cf1d2 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -224,7 +224,7 @@ static inline void tb_target_set_jmp_target(uintptr_t 
tc_ptr,
 #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
 
 #ifdef CONFIG_SOFTMMU
-#define TCG_TARGET_NEED_LDST_LABELS
+#define TCG_TARGET_NEED_LDST_OOL_LABELS
 #endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 16d5af76ad..1833f4c2b2 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1619,7 +1619,7 @@ static void tcg_out_nopn(TCGContext *s, int n)
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "tcg-ldst-ool.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
@@ -1632,6 +1632,14 @@ static void * const qemu_ld_helpers[16] = {
 [MO_BEUW] = helper_be_lduw_mmu,
 [MO_BEUL] = helper_be_ldul_mmu,
 [MO_BEQ]  = helper_be_ldq_mmu,
+
+[MO_SB]   = helper_ret_ldsb_mmu,
+[MO_LESW] = helper_le_ldsw_mmu,
+[MO_BESW] = helper_be_ldsw_mmu,
+#if TCG_TARGET_REG_BITS == 64
+[MO_LESL] = helper_le_ldsl_mmu,
+[MO_BESL] = helper_be_ldsl_mmu,
+#endif
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
@@ -1741,18 +1749,18 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 }
 
 /* jne slow_path */
-tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+tcg_out_opc(s, OPC_JCC_short + JCC_JNE, 0, 0, 0);
 label_ptr[0] = s->code_ptr;
-s->code_ptr += 4;
+s->code_ptr += 1;
 
 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
 /* cmp 4(r0), addrhi */
 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
 
 /* jne slow_path */
-tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+tcg_out_opc(s, OPC_JCC_short + JCC_JNE, 0, 0, 0);
 label_ptr[1] = s->code_ptr;
-s->code_ptr += 4;
+s->code_ptr += 1;
 }
 
 /* TLB Hit.  */
@@ -1764,181 +1772,6 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 return base;
 }
 
-/*
- * Record the context of a call to the out of line helper code for the slow 
path
- * for a load or store, so that we can later generate the correct helper code
- */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
-TCGReg datalo, TCGReg datahi,
-TCGReg addrlo, TCGReg addrhi,
-tcg_insn_unit *raddr,
-tcg_insn_unit **label_ptr)
-{
-TCGLabelQemuLdst *label = new_ldst_label(s);
-
-label->is_ld = is_ld;
-label->oi = oi;
-label->datalo_reg = datalo;
-label->datahi_reg = datahi;
-label->addrlo_reg = addrlo;
-label->addrhi_reg = addrhi;
-label->raddr = raddr;
-label->label_ptr[0] = label_ptr[0];
-if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-label->label_ptr[1] = label_ptr[1];
-}
-}
-
-/*
- * Generate code for the slow path for a load at the end of block
- */
-static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-TCGMemOpIdx oi = l->oi;
-TCGMemOp opc = get_memop(oi);
-TCGReg data_reg;
-tcg_insn_unit **label_ptr = >label_ptr[0];
-
-/* resolve label address */
-tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
-if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
-}
-
-if (TCG_TARGET_REG_BITS == 32) {
-int ofs = 0;
-
-tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
-ofs += 4;
-
-tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-
-if (TARGET_LONG_BITS == 64) {
-tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-}
-
-tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
-ofs += 4;
-
-tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
-} else {
-tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-/* The second argument is already loaded with addrlo.  */
-tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
-tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
- (uintptr_t)l->raddr);
-}
-
-tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
-
-data_reg = l->datalo_reg;
-switch (opc & MO_SSIZE) {
-case MO_SB:
-

[Qemu-devel] [PATCH for-4.0 16/17] tcg/arm: Force qemu_ld/st arguments into fixed registers

2018-11-12 Thread Richard Henderson
This is an incremental step toward moving the qemu_ld/st
code sequence out of line.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.inc.c | 116 +--
 1 file changed, 75 insertions(+), 41 deletions(-)

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 2deeb1f5d1..75589b43e2 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -270,38 +270,15 @@ static const char 
*target_parse_constraint(TCGArgConstraint *ct,
 ct->u.regs = 0x;
 break;
 
-/* qemu_ld address */
-case 'l':
-ct->ct |= TCG_CT_REG;
-ct->u.regs = 0x;
-#ifdef CONFIG_SOFTMMU
-/* r0-r2,lr will be overwritten when reading the tlb entry,
-   so don't use these. */
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
-#endif
-break;
-
+#ifndef CONFIG_SOFTMMU
 /* qemu_st address & data */
 case 's':
 ct->ct |= TCG_CT_REG;
 ct->u.regs = 0x;
-/* r0-r2 will be overwritten when reading the tlb entry (softmmu only)
-   and r0-r1 doing the byte swapping, so don't use these. */
+/* r0 and tmp are needed for byte swapping.  */
 tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0);
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1);
-#if defined(CONFIG_SOFTMMU)
-/* Avoid clashes with registers being used for helper args */
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2);
-#if TARGET_LONG_BITS == 64
-/* Avoid clashes with registers being used for helper args */
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3);
-#endif
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14);
-#endif
 break;
+#endif
 
 default:
 return NULL;
@@ -1630,8 +1607,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, bool is64)
 TCGMemOpIdx oi;
 TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
-int mem_index;
-TCGReg addend;
+int mem_index, avail;
+TCGReg addend, t0, t1;
 tcg_insn_unit *label_ptr;
 #endif
 
@@ -1644,8 +1621,20 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, bool is64)
 
 #ifdef CONFIG_SOFTMMU
 mem_index = get_mmuidx(oi);
+
+avail = 0xf;
+avail &= ~(1 << addrlo);
+if (TARGET_LONG_BITS == 64) {
+avail &= ~(1 << addrhi);
+}
+tcg_debug_assert(avail & 1);
+t0 = TCG_REG_R0;
+avail &= ~1;
+tcg_debug_assert(avail != 0);
+t1 = ctz32(avail);
+
 addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 1,
-  TCG_REG_R0, TCG_REG_R1, TCG_REG_TMP);
+  t0, t1, TCG_REG_TMP);
 
 /* This a conditional BL only to load a pointer within this opcode into LR
for the slow path.  We will not be using the value for a tail call.  */
@@ -1762,8 +1751,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg 
*args, bool is64)
 TCGMemOpIdx oi;
 TCGMemOp opc;
 #ifdef CONFIG_SOFTMMU
-int mem_index;
-TCGReg addend;
+int mem_index, avail;
+TCGReg addend, t0, t1;
 tcg_insn_unit *label_ptr;
 #endif
 
@@ -1776,8 +1765,24 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg 
*args, bool is64)
 
 #ifdef CONFIG_SOFTMMU
 mem_index = get_mmuidx(oi);
+
+avail = 0xf;
+avail &= ~(1 << addrlo);
+avail &= ~(1 << datalo);
+if (TARGET_LONG_BITS == 64) {
+avail &= ~(1 << addrhi);
+}
+if (is64) {
+avail &= ~(1 << datahi);
+}
+tcg_debug_assert(avail & 1);
+t0 = TCG_REG_R0;
+avail &= ~1;
+tcg_debug_assert(avail != 0);
+t1 = ctz32(avail);
+
 addend = tcg_out_tlb_read(s, addrlo, addrhi, opc, mem_index, 0,
-  TCG_REG_R0, TCG_REG_R1, TCG_REG_TMP);
+  t0, t1, TCG_REG_TMP);
 
 tcg_out_qemu_st_index(s, COND_EQ, opc, datalo, datahi, addrlo, addend);
 
@@ -2118,11 +2123,14 @@ static const TCGTargetOpDef 
*tcg_target_op_def(TCGOpcode op)
 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
 static const TCGTargetOpDef s_s = { .args_ct_str = { "s", "s" } };
-static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
+static const TCGTargetOpDef a_b = { .args_ct_str = { "a", "b" } };
+static const TCGTargetOpDef c_b = { .args_ct_str = { "c", "b" } };
 static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
-static const TCGTargetOpDef r_r_l = { .args_ct_str = { "r", "r", "l" } };
-static const TCGTargetOpDef r_l_l = { .args_ct_str = { "r", "l", "l" } };
 static const TCGTargetOpDef s_s_s = { .args_ct_str = { "s", "s", "s" } };
+static const TCGTargetOpDef a_c_d = { .args_ct_str = { "a", "c", "d" } };
+static const TCGTargetOpDef 

[Qemu-devel] [PATCH for-4.0 13/17] tcg/arm: Parameterize the temps for tcg_out_tlb_read

2018-11-12 Thread Richard Henderson
When moving the qemu_ld/st arguments to the right place for
a function call, we'll need to move the temps out of the way.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.inc.c | 89 +---
 1 file changed, 46 insertions(+), 43 deletions(-)

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 80d174ef44..414c91c9ea 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1245,11 +1245,14 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg 
argreg,
 /* We're expecting to use an 8-bit immediate and to mask.  */
 QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
 
-/* Load and compare a TLB entry, leaving the flags set.  Returns the register
-   containing the addend of the tlb entry.  Clobbers R0, R1, R2, TMP.  */
-
+/*
+ *Load and compare a TLB entry, leaving the flags set.  Returns the register
+ * containing the addend of the tlb entry.  Clobbers t0, t1, t2, t3.
+ * T0 and T1 must be consecutive for LDRD.
+ */
 static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
-   TCGMemOp opc, int mem_index, bool is_load)
+   TCGMemOp opc, int mem_index, bool is_load,
+   TCGReg t0, TCGReg t1, TCGReg t2, TCGReg t3)
 {
 TCGReg base = TCG_AREG0;
 int cmp_off =
@@ -1262,36 +1265,37 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 unsigned a_bits = get_alignment_bits(opc);
 
 /* V7 generates the following:
- *   ubfx   r0, addrlo, #TARGET_PAGE_BITS, #CPU_TLB_BITS
- *   addr2, env, #high
- *   addr2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
- *   ldrr0, [r2, #cmp]
- *   ldrr2, [r2, #add]
- *   movw   tmp, #page_align_mask
- *   bictmp, addrlo, tmp
- *   cmpr0, tmp
+ *   ubfx   t0, addrlo, #TARGET_PAGE_BITS, #CPU_TLB_BITS
+ *   addt2, env, #high
+ *   addt2, t2, r0, lsl #CPU_TLB_ENTRY_BITS
+ *   ldrt0, [t2, #cmp]  (and t1 w/ldrd)
+ *   ldrt2, [t2, #add]
+ *   movw   t3, #page_align_mask
+ *   bict3, addrlo, t3
+ *   cmpt0, t3
  *
  * Otherwise we generate:
- *   shrtmp, addrlo, #TARGET_PAGE_BITS
- *   addr2, env, #high
- *   andr0, tmp, #(CPU_TLB_SIZE - 1)
- *   addr2, r2, r0, lsl #CPU_TLB_ENTRY_BITS
- *   ldrr0, [r2, #cmp]
- *   ldrr2, [r2, #add]
+ *   shrt3, addrlo, #TARGET_PAGE_BITS
+ *   addt2, env, #high
+ *   andt0, t3, #(CPU_TLB_SIZE - 1)
+ *   addt2, t2, t0, lsl #CPU_TLB_ENTRY_BITS
+ *   ldrt0, [t2, #cmp]  (and t1 w/ldrd)
+ *   ldrt2, [t2, #add]
  *   tstaddrlo, #s_mask
- *   cmpeq  r0, tmp, lsl #TARGET_PAGE_BITS
+ *   cmpeq  t0, t3, lsl #TARGET_PAGE_BITS
  */
 if (use_armv7_instructions) {
-tcg_out_extract(s, COND_AL, TCG_REG_R0, addrlo,
+tcg_out_extract(s, COND_AL, t0, addrlo,
 TARGET_PAGE_BITS, CPU_TLB_BITS);
 } else {
-tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP,
+tcg_out_dat_reg(s, COND_AL, ARITH_MOV, t3,
 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
 }
 
 /* Add portions of the offset until the memory access is in range.
  * If we plan on using ldrd, reduce to an 8-bit offset; otherwise
- * we can use a 12-bit offset.  */
+ * we can use a 12-bit offset.
+ */
 if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
 mask_off = 0xff;
 } else {
@@ -1301,34 +1305,33 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 int shift = ctz32(cmp_off & ~mask_off) & ~1;
 int rot = ((32 - shift) << 7) & 0xf00;
 int addend = cmp_off & (0xff << shift);
-tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
+tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t2, base,
 rot | ((cmp_off >> shift) & 0xff));
-base = TCG_REG_R2;
+base = t2;
 add_off -= addend;
 cmp_off -= addend;
 }
 
 if (!use_armv7_instructions) {
-tcg_out_dat_imm(s, COND_AL, ARITH_AND,
-TCG_REG_R0, TCG_REG_TMP, CPU_TLB_SIZE - 1);
+tcg_out_dat_imm(s, COND_AL, ARITH_AND, t0, t3, CPU_TLB_SIZE - 1);
 }
-tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_R2, base,
-TCG_REG_R0, SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
+tcg_out_dat_reg(s, COND_AL, ARITH_ADD, t2, base, t0,
+SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
 
 /* Load the tlb comparator.  Use ldrd if needed and available,
but due to how the pointer needs setting up, ldm isn't useful.
Base arm5 doesn't have ldrd, but armv5te does.  */
 if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off);
+tcg_out_ldrd_8(s, COND_AL, t0, t2, cmp_off);

[Qemu-devel] [PATCH for-4.0 15/17] tcg/arm: Reduce the number of temps for tcg_out_tlb_read

2018-11-12 Thread Richard Henderson
When moving the qemu_ld/st thunk out of line, we no longer have LR for
use as a temporary.  In the worst case we must make do with 3 temps,
when dealing with a 64-bit guest address.  This in turn implies that we
cannot use LDRD anymore, as there are not enough temps.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.inc.c | 97 ++--
 1 file changed, 53 insertions(+), 44 deletions(-)

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 4339c472e8..2deeb1f5d1 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1251,13 +1251,12 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg 
argreg,
 QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8);
 
 /*
- *Load and compare a TLB entry, leaving the flags set.  Returns the register
- * containing the addend of the tlb entry.  Clobbers t0, t1, t2, t3.
- * T0 and T1 must be consecutive for LDRD.
+ * Load and compare a TLB entry, leaving the flags set.  Returns the register
+ * containing the addend of the tlb entry.  Clobbers t0, t1, t2.
  */
 static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
TCGMemOp opc, int mem_index, bool is_load,
-   TCGReg t0, TCGReg t1, TCGReg t2, TCGReg t3)
+   TCGReg t0, TCGReg t1, TCGReg t2)
 {
 TCGReg base = TCG_AREG0;
 int cmp_off =
@@ -1265,49 +1264,64 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
  ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
  : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write));
 int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend);
-int mask_off;
 unsigned s_bits = opc & MO_SIZE;
 unsigned a_bits = get_alignment_bits(opc);
 
 /* V7 generates the following:
  *   ubfx   t0, addrlo, #TARGET_PAGE_BITS, #CPU_TLB_BITS
  *   addt2, env, #high
- *   addt2, t2, r0, lsl #CPU_TLB_ENTRY_BITS
- *   ldrt0, [t2, #cmp]  (and t1 w/ldrd)
+ *   addt2, t2, t0, lsl #CPU_TLB_ENTRY_BITS
+ *   ldrt0, [t2, #cmp]
  *   ldrt2, [t2, #add]
- *   movw   t3, #page_align_mask
- *   bict3, addrlo, t3
- *   cmpt0, t3
+ *   movw   t1, #page_align_mask
+ *   bict1, addrlo, t1
+ *   cmpt0, t1
+ *
+ *   ubfx   t0, addrlo, #TPB, #CTB   -- 64-bit address
+ *   addt2, env, #high
+ *   addt2, t2, t0, lsl #CTEB
+ *   ldrt0, [t2, #cmplo]
+ *   movw   t1, #page_align_mask
+ *   bict1, addrlo, t1
+ *   cmpt0, t1
+ *   ldrt0, [t2, #cmphi]
+ *   ldrt2, [t2, #add]
+ *   cmpeq  t0, addrhi
  *
  * Otherwise we generate:
  *   shrt3, addrlo, #TARGET_PAGE_BITS
  *   addt2, env, #high
  *   andt0, t3, #(CPU_TLB_SIZE - 1)
  *   addt2, t2, t0, lsl #CPU_TLB_ENTRY_BITS
- *   ldrt0, [t2, #cmp]  (and t1 w/ldrd)
+ *   ldrt0, [t2, #cmp]
  *   ldrt2, [t2, #add]
  *   tstaddrlo, #s_mask
  *   cmpeq  t0, t3, lsl #TARGET_PAGE_BITS
+ *
+ *   shrt1, addrlo, #TPB -- 64-bit address
+ *   addt2, env, #high
+ *   andt0, t1, #CTS-1
+ *   addt2, t2, t0, lsl #CTEB
+ *   ldrt0, [t2, #cmplo]
+ *   tstaddrlo, #s_mask
+ *   cmpeq  t0, t1, lsl #TBP
+ *   ldrt0, [t2, #cmphi]
+ *   ldrt2, [t2, #add]
+ *   cmpeq  t0, addrhi
  */
 if (use_armv7_instructions) {
 tcg_out_extract(s, COND_AL, t0, addrlo,
 TARGET_PAGE_BITS, CPU_TLB_BITS);
 } else {
-tcg_out_dat_reg(s, COND_AL, ARITH_MOV, t3,
+tcg_out_dat_reg(s, COND_AL, ARITH_MOV, t1,
 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS));
 }
 
 /* Add portions of the offset until the memory access is in range.
- * If we plan on using ldrd, reduce to an 8-bit offset; otherwise
- * we can use a 12-bit offset.
+ * We are not using ldrd, so we can use a 12-bit offset.
  */
-if (use_armv6_instructions && TARGET_LONG_BITS == 64) {
-mask_off = 0xff;
-} else {
-mask_off = 0xfff;
-}
-while (cmp_off > mask_off) {
-int shift = ctz32(cmp_off & ~mask_off) & ~1;
+while (cmp_off > 0xfff) {
+int shift = ctz32(cmp_off & ~0xfff) & ~1;
 int rot = ((32 - shift) << 7) & 0xf00;
 int addend = cmp_off & (0xff << shift);
 tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t2, base,
@@ -1318,25 +1332,13 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 }
 
 if (!use_armv7_instructions) {
-tcg_out_dat_imm(s, COND_AL, ARITH_AND, t0, t3, CPU_TLB_SIZE - 1);
+tcg_out_dat_imm(s, COND_AL, ARITH_AND, t0, t1, CPU_TLB_SIZE - 1);
 }
 tcg_out_dat_reg(s, COND_AL, ARITH_ADD, t2, base, t0,
 SHIFT_IMM_LSL(CPU_TLB_ENTRY_BITS));
 
-/* 

[Qemu-devel] [PATCH for-4.0 09/17] tcg/aarch64: Parameterize the temps for tcg_out_tlb_read

2018-11-12 Thread Richard Henderson
When moving the qemu_ld/st arguments to the right place for
a function call, we'll need to move the temps out of the way.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.inc.c | 74 +++-
 1 file changed, 40 insertions(+), 34 deletions(-)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 148de0b7f2..c0ba9a6d50 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -1467,13 +1467,15 @@ static void add_qemu_ldst_label(TCGContext *s, bool 
is_ld, TCGMemOpIdx oi,
 label->label_ptr[0] = label_ptr;
 }
 
-/* Load and compare a TLB entry, emitting the conditional jump to the
-   slow path for the failure case, which will be patched later when finalizing
-   the slow path. Generated code returns the host addend in X1,
-   clobbers X0,X2,X3,TMP. */
-static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
- tcg_insn_unit **label_ptr, int mem_index,
- bool is_read)
+/*
+ * Load and compare a TLB entry, emitting the conditional jump to the
+ * slow path on failure.  Returns the register for the host addend.
+ * Clobbers t0, t1, t2, t3.
+ */
+static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
+   tcg_insn_unit **label_ptr, int mem_index,
+   bool is_read, TCGReg t0, TCGReg t1,
+   TCGReg t2, TCGReg t3)
 {
 int tlb_offset = is_read ?
 offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)
@@ -1491,55 +1493,56 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg 
addr_reg, TCGMemOp opc,
 if (a_bits >= s_bits) {
 x3 = addr_reg;
 } else {
+x3 = t3;
 tcg_out_insn(s, 3401, ADDI, TARGET_LONG_BITS == 64,
- TCG_REG_X3, addr_reg, s_mask - a_mask);
-x3 = TCG_REG_X3;
+ x3, addr_reg, s_mask - a_mask);
 }
 tlb_mask = (uint64_t)TARGET_PAGE_MASK | a_mask;
 
-/* Extract the TLB index from the address into X0.
-   X0 =
+/* Extract the TLB index from the address into T0.
+   T0 =
addr_reg */
-tcg_out_ubfm(s, TARGET_LONG_BITS == 64, TCG_REG_X0, addr_reg,
+tcg_out_ubfm(s, TARGET_LONG_BITS == 64, t0, addr_reg,
  TARGET_PAGE_BITS, TARGET_PAGE_BITS + CPU_TLB_BITS);
 
-/* Store the page mask part of the address into X3.  */
+/* Store the page mask part of the address into T3.  */
 tcg_out_logicali(s, I3404_ANDI, TARGET_LONG_BITS == 64,
- TCG_REG_X3, x3, tlb_mask);
+ t3, x3, tlb_mask);
 
-/* Add any "high bits" from the tlb offset to the env address into X2,
+/* Add any "high bits" from the tlb offset to the env address into T2,
to take advantage of the LSL12 form of the ADDI instruction.
-   X2 = env + (tlb_offset & 0xfff000) */
+   T2 = env + (tlb_offset & 0xfff000) */
 if (tlb_offset & 0xfff000) {
-tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_X2, base,
+tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, t2, base,
  tlb_offset & 0xfff000);
-base = TCG_REG_X2;
+base = t2;
 }
 
-/* Merge the tlb index contribution into X2.
-   X2 = X2 + (X0 << CPU_TLB_ENTRY_BITS) */
-tcg_out_insn(s, 3502S, ADD_LSL, TCG_TYPE_I64, TCG_REG_X2, base,
- TCG_REG_X0, CPU_TLB_ENTRY_BITS);
+/* Merge the tlb index contribution into T2.
+   T2 = T2 + (T0 << CPU_TLB_ENTRY_BITS) */
+tcg_out_insn(s, 3502S, ADD_LSL, TCG_TYPE_I64,
+ t2, base, t0, CPU_TLB_ENTRY_BITS);
 
-/* Merge "low bits" from tlb offset, load the tlb comparator into X0.
-   X0 = load [X2 + (tlb_offset & 0x000fff)] */
+/* Merge "low bits" from tlb offset, load the tlb comparator into T0.
+   T0 = load [T2 + (tlb_offset & 0x000fff)] */
 tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX,
- TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff,
- TARGET_LONG_BITS == 32 ? 2 : 3);
+ t0, t2, tlb_offset & 0xfff, TARGET_LONG_BITS == 32 ? 2 : 3);
 
 /* Load the tlb addend. Do that early to avoid stalling.
-   X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
-tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2,
+   T1 = load [T2 + (tlb_offset & 0xfff) + offsetof(addend)] */
+tcg_out_ldst(s, I3312_LDRX, t1, t2,
  (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
  (is_read ? offsetof(CPUTLBEntry, addr_read)
   : offsetof(CPUTLBEntry, addr_write)), 3);
 
 /* Perform the address comparison. */
-tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
+tcg_out_cmp(s, (TARGET_LONG_BITS == 64), t0, t3, 0);
 
 /* If not equal, we jump to the slow path. */
 *label_ptr = s->code_ptr;
 tcg_out_goto_cond_noaddr(s, 

[Qemu-devel] [PATCH for-4.0 17/17] tcg/arm: Use TCG_TARGET_NEED_LDST_OOL_LABELS

2018-11-12 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.h |   2 +-
 tcg/arm/tcg-target.inc.c | 302 +++
 2 files changed, 118 insertions(+), 186 deletions(-)

diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 94b3578c55..02981abdcc 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -141,7 +141,7 @@ static inline void flush_icache_range(uintptr_t start, 
uintptr_t stop)
 void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t);
 
 #ifdef CONFIG_SOFTMMU
-#define TCG_TARGET_NEED_LDST_LABELS
+#define TCG_TARGET_NEED_LDST_OOL_LABELS
 #endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 75589b43e2..1e9ff693d9 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1134,7 +1134,7 @@ static TCGCond tcg_out_cmp2(TCGContext *s, const TCGArg 
*args,
 }
 
 #ifdef CONFIG_SOFTMMU
-#include "tcg-ldst.inc.c"
+#include "tcg-ldst-ool.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
@@ -1358,127 +1358,6 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
 return t2;
 }
 
-/* Record the context of a call to the out of line helper code for the slow
-   path for a load or store, so that we can later generate the correct
-   helper code.  */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
-TCGReg datalo, TCGReg datahi, TCGReg addrlo,
-TCGReg addrhi, tcg_insn_unit *raddr,
-tcg_insn_unit *label_ptr)
-{
-TCGLabelQemuLdst *label = new_ldst_label(s);
-
-label->is_ld = is_ld;
-label->oi = oi;
-label->datalo_reg = datalo;
-label->datahi_reg = datahi;
-label->addrlo_reg = addrlo;
-label->addrhi_reg = addrhi;
-label->raddr = raddr;
-label->label_ptr[0] = label_ptr;
-}
-
-static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
-{
-TCGReg argreg, datalo, datahi;
-TCGMemOpIdx oi = lb->oi;
-TCGMemOp opc = get_memop(oi);
-void *func;
-
-reloc_pc24(lb->label_ptr[0], s->code_ptr);
-
-argreg = tcg_out_arg_reg32(s, TCG_REG_R0, TCG_AREG0);
-if (TARGET_LONG_BITS == 64) {
-argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, lb->addrhi_reg);
-} else {
-argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
-}
-argreg = tcg_out_arg_imm32(s, argreg, oi);
-argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14);
-
-/* For armv6 we can use the canonical unsigned helpers and minimize
-   icache usage.  For pre-armv6, use the signed helpers since we do
-   not have a single insn sign-extend.  */
-if (use_armv6_instructions) {
-func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)];
-} else {
-func = qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)];
-if (opc & MO_SIGN) {
-opc = MO_UL;
-}
-}
-tcg_out_call(s, func);
-
-datalo = lb->datalo_reg;
-datahi = lb->datahi_reg;
-switch (opc & MO_SSIZE) {
-case MO_SB:
-tcg_out_ext8s(s, COND_AL, datalo, TCG_REG_R0);
-break;
-case MO_SW:
-tcg_out_ext16s(s, COND_AL, datalo, TCG_REG_R0);
-break;
-default:
-tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-break;
-case MO_Q:
-if (datalo != TCG_REG_R1) {
-tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-} else if (datahi != TCG_REG_R0) {
-tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_R0);
-} else {
-tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0);
-tcg_out_mov_reg(s, COND_AL, datahi, TCG_REG_R1);
-tcg_out_mov_reg(s, COND_AL, datalo, TCG_REG_TMP);
-}
-break;
-}
-
-tcg_out_goto(s, COND_AL, lb->raddr);
-}
-
-static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
-{
-TCGReg argreg, datalo, datahi;
-TCGMemOpIdx oi = lb->oi;
-TCGMemOp opc = get_memop(oi);
-
-reloc_pc24(lb->label_ptr[0], s->code_ptr);
-
-argreg = TCG_REG_R0;
-argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0);
-if (TARGET_LONG_BITS == 64) {
-argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, lb->addrhi_reg);
-} else {
-argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg);
-}
-
-datalo = lb->datalo_reg;
-datahi = lb->datahi_reg;
-switch (opc & MO_SIZE) {
-case MO_8:
-argreg = tcg_out_arg_reg8(s, argreg, datalo);
-break;
-case MO_16:
-argreg = tcg_out_arg_reg16(s, argreg, datalo);
-break;
-case MO_32:
-default:
-argreg = tcg_out_arg_reg32(s, argreg, datalo);
-break;
-case MO_64:
-

[Qemu-devel] [PATCH for-4.0 11/17] tcg/aarch64: Use B not BL for tcg_out_goto_long

2018-11-12 Thread Richard Henderson
This was apparently a typo introduced when copying from tcg_out_call.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.inc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index ea5fe33fca..403f5caf14 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -1139,7 +1139,7 @@ static inline void tcg_out_goto_long(TCGContext *s, 
tcg_insn_unit *target,
 {
 ptrdiff_t offset = target - s->code_ptr;
 if (offset == sextract64(offset, 0, 26)) {
-tcg_out_insn(s, 3206, BL, offset);
+tcg_out_insn(s, 3206, B, offset);
 } else {
 tcg_out_movi(s, TCG_TYPE_I64, scratch, (intptr_t)target);
 tcg_out_insn(s, 3207, BR, scratch);
-- 
2.17.2




[Qemu-devel] [PATCH for-4.0 14/17] tcg/arm: Add constraints for R0-R5

2018-11-12 Thread Richard Henderson
These are function call arguments that we will need soon.

Signed-off-by: Richard Henderson 
---
 tcg/arm/tcg-target.inc.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index 414c91c9ea..4339c472e8 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -246,7 +246,12 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 static const char *target_parse_constraint(TCGArgConstraint *ct,
const char *ct_str, TCGType type)
 {
-switch (*ct_str++) {
+char c = *ct_str++;
+switch (c) {
+case 'a' ... 'f': /* r0 - r5 */
+ct->ct |= TCG_CT_REG;
+tcg_regset_set_reg(ct->u.regs, TCG_REG_R0 + (c - 'a'));
+break;
 case 'I':
 ct->ct |= TCG_CT_CONST_ARM;
 break;
-- 
2.17.2




[Qemu-devel] [PATCH for-4.0 05/17] tcg: Return success from patch_reloc

2018-11-12 Thread Richard Henderson
This moves the assert for success from inside patch_reloc
to outside patch_reloc.  This touches all tcg backends.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.inc.c | 44 ++---
 tcg/arm/tcg-target.inc.c | 26 +---
 tcg/i386/tcg-target.inc.c| 17 +++--
 tcg/mips/tcg-target.inc.c| 29 +-
 tcg/ppc/tcg-target.inc.c | 47 ++--
 tcg/s390/tcg-target.inc.c| 37 +++-
 tcg/sparc/tcg-target.inc.c   | 13 ++
 tcg/tcg-pool.inc.c   |  5 +++-
 tcg/tcg.c|  8 +++---
 tcg/tci/tcg-target.inc.c |  3 ++-
 10 files changed, 125 insertions(+), 104 deletions(-)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 083592a4d7..30091f6a69 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -78,48 +78,40 @@ static const int tcg_target_call_oarg_regs[1] = {
 #define TCG_REG_GUEST_BASE TCG_REG_X28
 #endif
 
-static inline void reloc_pc26(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
+static inline bool reloc_pc26(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
 {
 ptrdiff_t offset = target - code_ptr;
-tcg_debug_assert(offset == sextract64(offset, 0, 26));
-/* read instruction, mask away previous PC_REL26 parameter contents,
-   set the proper offset, then write back the instruction. */
-*code_ptr = deposit32(*code_ptr, 0, 26, offset);
+if (offset == sextract64(offset, 0, 26)) {
+/* read instruction, mask away previous PC_REL26 parameter contents,
+   set the proper offset, then write back the instruction. */
+*code_ptr = deposit32(*code_ptr, 0, 26, offset);
+return true;
+}
+return false;
 }
 
-static inline void reloc_pc26_atomic(tcg_insn_unit *code_ptr,
- tcg_insn_unit *target)
+static inline bool reloc_pc19(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
 {
 ptrdiff_t offset = target - code_ptr;
-tcg_insn_unit insn;
-tcg_debug_assert(offset == sextract64(offset, 0, 26));
-/* read instruction, mask away previous PC_REL26 parameter contents,
-   set the proper offset, then write back the instruction. */
-insn = atomic_read(code_ptr);
-atomic_set(code_ptr, deposit32(insn, 0, 26, offset));
+if (offset == sextract64(offset, 0, 19)) {
+*code_ptr = deposit32(*code_ptr, 5, 19, offset);
+return true;
+}
+return false;
 }
 
-static inline void reloc_pc19(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
-{
-ptrdiff_t offset = target - code_ptr;
-tcg_debug_assert(offset == sextract64(offset, 0, 19));
-*code_ptr = deposit32(*code_ptr, 5, 19, offset);
-}
-
-static inline void patch_reloc(tcg_insn_unit *code_ptr, int type,
+static inline bool patch_reloc(tcg_insn_unit *code_ptr, int type,
intptr_t value, intptr_t addend)
 {
 tcg_debug_assert(addend == 0);
 switch (type) {
 case R_AARCH64_JUMP26:
 case R_AARCH64_CALL26:
-reloc_pc26(code_ptr, (tcg_insn_unit *)value);
-break;
+return reloc_pc26(code_ptr, (tcg_insn_unit *)value);
 case R_AARCH64_CONDBR19:
-reloc_pc19(code_ptr, (tcg_insn_unit *)value);
-break;
+return reloc_pc19(code_ptr, (tcg_insn_unit *)value);
 default:
-tcg_abort();
+g_assert_not_reached();
 }
 }
 
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index e1fbf465cb..80d174ef44 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -187,27 +187,23 @@ static const uint8_t tcg_cond_to_arm_cond[] = {
 [TCG_COND_GTU] = COND_HI,
 };
 
-static inline void reloc_pc24(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
+static inline bool reloc_pc24(tcg_insn_unit *code_ptr, tcg_insn_unit *target)
 {
 ptrdiff_t offset = (tcg_ptr_byte_diff(target, code_ptr) - 8) >> 2;
-*code_ptr = (*code_ptr & ~0xff) | (offset & 0xff);
+if (offset == sextract32(offset, 0, 24)) {
+*code_ptr = (*code_ptr & ~0xff) | (offset & 0xff);
+return true;
+}
+return false;
 }
 
-static inline void reloc_pc24_atomic(tcg_insn_unit *code_ptr, tcg_insn_unit 
*target)
-{
-ptrdiff_t offset = (tcg_ptr_byte_diff(target, code_ptr) - 8) >> 2;
-tcg_insn_unit insn = atomic_read(code_ptr);
-tcg_debug_assert(offset == sextract32(offset, 0, 24));
-atomic_set(code_ptr, deposit32(insn, 0, 24, offset));
-}
-
-static void patch_reloc(tcg_insn_unit *code_ptr, int type,
+static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 intptr_t value, intptr_t addend)
 {
 tcg_debug_assert(addend == 0);
 
 if (type == R_ARM_PC24) {
-reloc_pc24(code_ptr, (tcg_insn_unit *)value);
+return reloc_pc24(code_ptr, (tcg_insn_unit *)value);
 } else if (type == R_ARM_PC13) {
 intptr_t diff = value - 

[Qemu-devel] [PATCH for-4.0 08/17] tcg/aarch64: Add constraints for x0, x1, x2

2018-11-12 Thread Richard Henderson
These are function call arguments that we will need soon.

Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.inc.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 30091f6a69..148de0b7f2 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -125,6 +125,18 @@ static const char 
*target_parse_constraint(TCGArgConstraint *ct,
const char *ct_str, TCGType type)
 {
 switch (*ct_str++) {
+case 'a': /* x0 */
+ct->ct |= TCG_CT_REG;
+tcg_regset_set_reg(ct->u.regs, TCG_REG_X0);
+break;
+case 'b': /* x1 */
+ct->ct |= TCG_CT_REG;
+tcg_regset_set_reg(ct->u.regs, TCG_REG_X1);
+break;
+case 'c': /* x2 */
+ct->ct |= TCG_CT_REG;
+tcg_regset_set_reg(ct->u.regs, TCG_REG_X2);
+break;
 case 'r': /* general registers */
 ct->ct |= TCG_CT_REG;
 ct->u.regs |= 0xu;
-- 
2.17.2




[Qemu-devel] [PATCH v5 11/14] hw/timer/nrf51_timer: Add nRF51 Timer peripheral

2018-11-12 Thread Steffen Görtz
This patch adds the model for the nRF51 timer peripheral.
Currently, only the TIMER mode is implemented.

Signed-off-by: Steffen Görtz 
---
 hw/timer/Makefile.objs |   1 +
 hw/timer/nrf51_timer.c | 337 +
 hw/timer/trace-events  |   5 +
 include/hw/timer/nrf51_timer.h |  81 
 4 files changed, 424 insertions(+)
 create mode 100644 hw/timer/nrf51_timer.c
 create mode 100644 include/hw/timer/nrf51_timer.h

diff --git a/hw/timer/Makefile.objs b/hw/timer/Makefile.objs
index b32194d153..0e9a4530f8 100644
--- a/hw/timer/Makefile.objs
+++ b/hw/timer/Makefile.objs
@@ -23,6 +23,7 @@ common-obj-$(CONFIG_IMX) += imx_gpt.o
 common-obj-$(CONFIG_LM32) += lm32_timer.o
 common-obj-$(CONFIG_MILKYMIST) += milkymist-sysctl.o
 common-obj-$(CONFIG_XLNX_ZYNQMP) += xlnx-zynqmp-rtc.o
+common-obj-$(CONFIG_NRF51_SOC) += nrf51_timer.o
 
 obj-$(CONFIG_ALTERA_TIMER) += altera_timer.o
 obj-$(CONFIG_EXYNOS4) += exynos4210_mct.o
diff --git a/hw/timer/nrf51_timer.c b/hw/timer/nrf51_timer.c
new file mode 100644
index 00..40a41ab22c
--- /dev/null
+++ b/hw/timer/nrf51_timer.c
@@ -0,0 +1,337 @@
+/*
+ * nRF51 System-on-Chip Timer peripheral
+ *
+ * Reference Manual: http://infocenter.nordicsemi.com/pdf/nRF51_RM_v3.0.pdf
+ * Product Spec: http://infocenter.nordicsemi.com/pdf/nRF51822_PS_v3.1.pdf
+ *
+ * Copyright 2018 Steffen Görtz 
+ *
+ * This code is licensed under the GPL version 2 or later.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "hw/arm/nrf51.h"
+#include "hw/timer/nrf51_timer.h"
+#include "trace.h"
+
+#define MINIMUM_PERIOD 1UL
+#define TIMER_TICK_PS 62500UL
+
+static uint32_t const bitwidths[] = {16, 8, 24, 32};
+
+static void set_prescaler(NRF51TimerState *s, uint32_t prescaler)
+{
+uint64_t period;
+s->prescaler = prescaler;
+
+period = ((1UL << s->prescaler) * TIMER_TICK_PS) / 1000;
+/* Limit minimum timeout period to 10us to allow some progress */
+if (period < MINIMUM_PERIOD) {
+s->tick_period = MINIMUM_PERIOD;
+s->counter_inc = MINIMUM_PERIOD / period;
+} else {
+s->tick_period = period;
+s->counter_inc = 1;
+}
+}
+
+static void update_irq(NRF51TimerState *s)
+{
+bool flag = false;
+size_t i;
+
+for (i = 0; i < NRF51_TIMER_REG_COUNT; i++) {
+flag |= s->events_compare[i] && extract32(s->inten, 16 + i, 1);
+}
+qemu_set_irq(s->irq, flag);
+}
+
+static void timer_expire(void *opaque)
+{
+NRF51TimerState *s = NRF51_TIMER(opaque);
+bool should_stop = false;
+uint32_t counter = s->counter;
+size_t i;
+uint64_t diff;
+
+if (s->running) {
+for (i = 0; i < NRF51_TIMER_REG_COUNT; i++) {
+if (counter < s->cc[i]) {
+diff = s->cc[i] - counter;
+} else {
+diff = (s->cc[i] + BIT(bitwidths[s->bitmode])) - counter;
+}
+
+if (diff <= s->counter_inc) {
+s->events_compare[i] = true;
+
+if (s->shorts & BIT(i)) {
+s->counter = 0;
+}
+
+should_stop |= s->shorts & BIT(i + 8);
+}
+}
+
+s->counter += s->counter_inc;
+s->counter &= (BIT(bitwidths[s->bitmode]) - 1);
+
+update_irq(s);
+
+if (should_stop) {
+s->running = false;
+timer_del(>timer);
+} else {
+s->time_offset += s->tick_period;
+timer_mod_ns(>timer, s->time_offset);
+}
+} else {
+timer_del(>timer);
+}
+}
+
+static void counter_compare(NRF51TimerState *s)
+{
+uint32_t counter = s->counter;
+size_t i;
+for (i = 0; i < NRF51_TIMER_REG_COUNT; i++) {
+if (counter == s->cc[i]) {
+s->events_compare[i] = true;
+
+if (s->shorts & BIT(i)) {
+s->counter = 0;
+}
+}
+}
+}
+
+static uint64_t nrf51_timer_read(void *opaque, hwaddr offset, unsigned int 
size)
+{
+NRF51TimerState *s = NRF51_TIMER(opaque);
+uint64_t r = 0;
+
+switch (offset) {
+case NRF51_TIMER_EVENT_COMPARE_0 ... NRF51_TIMER_EVENT_COMPARE_3:
+r = s->events_compare[(offset - NRF51_TIMER_EVENT_COMPARE_0) / 4];
+break;
+case NRF51_TIMER_REG_SHORTS:
+r = s->shorts;
+break;
+case NRF51_TIMER_REG_INTENSET:
+r = s->inten;
+break;
+case NRF51_TIMER_REG_INTENCLR:
+r = s->inten;
+break;
+case NRF51_TIMER_REG_MODE:
+r = s->mode;
+break;
+case NRF51_TIMER_REG_BITMODE:
+r = s->bitmode;
+break;
+case NRF51_TIMER_REG_PRESCALER:
+r = s->prescaler;
+break;
+case NRF51_TIMER_REG_CC0 ... NRF51_TIMER_REG_CC3:
+r = s->cc[(offset - NRF51_TIMER_REG_CC0) / 4];
+break;
+default:
+qemu_log_mask(LOG_GUEST_ERROR,
+"%s: bad read offset 

[Qemu-devel] [PATCH for-4.0 06/17] tcg: Add TCG_TARGET_NEED_LDST_OOL_LABELS

2018-11-12 Thread Richard Henderson
This variant of tcg-ldst.inc.c allows the entire thunk to be
moved out-of-line, with caching across TBs within a region.

Signed-off-by: Richard Henderson 
---
 tcg/tcg.h  |  4 ++
 tcg/tcg-ldst-ool.inc.c | 94 ++
 tcg/tcg.c  | 20 +
 3 files changed, 118 insertions(+)
 create mode 100644 tcg/tcg-ldst-ool.inc.c

diff --git a/tcg/tcg.h b/tcg/tcg.h
index f4efbaa680..1255d2a2c6 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -706,6 +706,10 @@ struct TCGContext {
 #ifdef TCG_TARGET_NEED_LDST_LABELS
 QSIMPLEQ_HEAD(ldst_labels, TCGLabelQemuLdst) ldst_labels;
 #endif
+#ifdef TCG_TARGET_NEED_LDST_OOL_LABELS
+QSIMPLEQ_HEAD(ldst_labels, TCGLabelQemuLdstOol) ldst_ool_labels;
+GHashTable *ldst_ool_thunks;
+#endif
 #ifdef TCG_TARGET_NEED_POOL_LABELS
 struct TCGLabelPoolData *pool_labels;
 #endif
diff --git a/tcg/tcg-ldst-ool.inc.c b/tcg/tcg-ldst-ool.inc.c
new file mode 100644
index 00..8fb6550a8d
--- /dev/null
+++ b/tcg/tcg-ldst-ool.inc.c
@@ -0,0 +1,94 @@
+/*
+ * TCG Backend Data: load-store optimization only.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+typedef struct TCGLabelQemuLdstOol {
+QSIMPLEQ_ENTRY(TCGLabelQemuLdstOol) next;
+tcg_insn_unit *label;   /* label pointer to be updated */
+int reloc;  /* relocation type from label_ptr */
+intptr_t addend;/* relocation addend from label_ptr */
+uint32_t key;   /* oi : is_64 : is_ld */
+} TCGLabelQemuLdstOol;
+
+
+/*
+ * Generate TB finalization at the end of block
+ */
+
+static tcg_insn_unit *tcg_out_qemu_ldst_ool(TCGContext *s, bool is_ld,
+bool is64, TCGMemOpIdx oi);
+
+static bool tcg_out_ldst_ool_finalize(TCGContext *s)
+{
+TCGLabelQemuLdstOol *lb;
+
+/* qemu_ld/st slow paths */
+QSIMPLEQ_FOREACH(lb, >ldst_ool_labels, next) {
+gpointer dest, key = (gpointer)(uintptr_t)lb->key;
+TCGMemOpIdx oi;
+bool is_ld, is_64, ok;
+
+/* If we have generated the thunk, and it's still in range, all ok.  */
+dest = g_hash_table_lookup(s->ldst_ool_thunks, key);
+if (dest &&
+patch_reloc(lb->label, lb->reloc, (intptr_t)dest, lb->addend)) {
+continue;
+}
+
+/* Generate a new thunk.  */
+is_ld = extract32(lb->key, 0, 1);
+is_64 = extract32(lb->key, 1, 1);
+oi = extract32(lb->key, 2, 30);
+dest = tcg_out_qemu_ldst_ool(s, is_ld, is_64, oi);
+
+/* Test for (pending) buffer overflow.  The assumption is that any
+   one thunk beginning below the high water mark cannot overrun
+   the buffer completely.  Thus we can test for overflow after
+   generating code without having to check during generation.  */
+if (unlikely((void *)s->code_ptr > s->code_gen_highwater)) {
+return false;
+}
+
+/* Remember the thunk for next time.  */
+g_hash_table_replace(s->ldst_ool_thunks, key, dest);
+
+/* The new thunk must be in range.  */
+ok = patch_reloc(lb->label, lb->reloc, (intptr_t)dest, lb->addend);
+tcg_debug_assert(ok);
+}
+return true;
+}
+
+/*
+ * Allocate a new TCGLabelQemuLdstOol entry.
+ */
+
+static void add_ldst_ool_label(TCGContext *s, bool is_ld, bool is_64,
+   TCGMemOpIdx oi, int reloc, intptr_t addend)
+{
+TCGLabelQemuLdstOol *lb = tcg_malloc(sizeof(*lb));
+
+QSIMPLEQ_INSERT_TAIL(>ldst_ool_labels, lb, next);
+lb->label = s->code_ptr;
+lb->reloc = reloc;
+lb->addend = addend;
+lb->key = is_ld | (is_64 << 1) | (oi << 2);
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 54f1272187..885d842a12 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -521,6 +521,13 @@ static void tcg_region_assign(TCGContext *s, size_t 
curr_region)
 s->code_gen_ptr = start;
 s->code_gen_buffer_size = end - start;
 

[Qemu-devel] [PATCH for-4.0 04/17] tcg/i386: Force qemu_ld/st arguments into fixed registers

2018-11-12 Thread Richard Henderson
This is an incremental step toward moving the qemu_ld/st
code sequence out of line.

Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.inc.c | 193 +++---
 1 file changed, 159 insertions(+), 34 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 2a96ca4274..8a3e7690b6 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -171,6 +171,56 @@ static bool have_lzcnt;
 
 static tcg_insn_unit *tb_ret_addr;
 
+#ifdef CONFIG_SOFTMMU
+/*
+ * Constraint to choose a particular register.  This is used for softmmu
+ * loads and stores.  Registers with no assignment get an empty string.
+ */
+static const char * const one_reg_constraint[TCG_TARGET_NB_REGS] = {
+[TCG_REG_EAX] = "a",
+[TCG_REG_EBX] = "b",
+[TCG_REG_ECX] = "c",
+[TCG_REG_EDX] = "d",
+[TCG_REG_ESI] = "S",
+[TCG_REG_EDI] = "D",
+#if TCG_TARGET_REG_BITS == 64
+[TCG_REG_R8]  = "E",
+[TCG_REG_R9]  = "N",
+#endif
+};
+
+/*
+ * Calling convention for the softmmu load and store thunks.
+ *
+ * For 64-bit, we mostly use the host calling convention, therefore the
+ * real first argument is reserved for the ENV parameter that is passed
+ * on to the slow path helpers.
+ *
+ * For 32-bit, the host calling convention is stack based; we invent a
+ * private convention that uses 4 of the 6 available host registers, and
+ * we reserve EAX and EDX as temporaries for use by the thunk.
+ */
+static inline TCGReg softmmu_arg(unsigned n)
+{
+if (TCG_TARGET_REG_BITS == 64) {
+tcg_debug_assert(n < ARRAY_SIZE(tcg_target_call_iarg_regs) - 1);
+return tcg_target_call_iarg_regs[n + 1];
+} else {
+static const TCGReg local_order[] = {
+TCG_REG_ESI, TCG_REG_EDI, TCG_REG_ECX, TCG_REG_EBX
+};
+tcg_debug_assert(n < ARRAY_SIZE(local_order));
+return local_order[n];
+}
+}
+
+#define qemu_memop_arg(N)  one_reg_constraint[softmmu_arg(N)]
+#define qemu_memop_ret(N)  (N ? "d" : "a")
+#else
+#define qemu_memop_arg(N)  "L"
+#define qemu_memop_ret(N)  "L"
+#endif /* CONFIG_SOFTMMU */
+
 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
 intptr_t value, intptr_t addend)
 {
@@ -1677,11 +1727,15 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg 
addrlo, TCGReg addrhi,
copies the entire guest address for the slow path, while truncation
for the 32-bit host happens with the fastpath ADDL below.  */
 if (TCG_TARGET_REG_BITS == 64) {
-base = tcg_target_call_iarg_regs[1];
+tcg_debug_assert(addrlo == tcg_target_call_iarg_regs[1]);
+if (TARGET_LONG_BITS == 32) {
+tcg_out_ext32u(s, addrlo, addrlo);
+}
+base = addrlo;
 } else {
 base = r1;
+tcg_out_mov(s, ttype, base, addrlo);
 }
-tcg_out_mov(s, ttype, base, addrlo);
 
 /* jne slow_path */
 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
@@ -2006,16 +2060,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
common. */
 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
 {
-TCGReg datalo, datahi, addrlo;
-TCGReg addrhi __attribute__((unused));
+TCGReg datalo, addrlo;
+TCGReg datahi __attribute__((unused)) = -1;
+TCGReg addrhi __attribute__((unused)) = -1;
 TCGMemOpIdx oi;
 TCGMemOp opc;
+int i = -1;
 
-datalo = *args++;
-datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
-addrlo = *args++;
-addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
-oi = *args++;
+datalo = args[++i];
+if (TCG_TARGET_REG_BITS == 32 && is64) {
+datahi = args[++i];
+}
+addrlo = args[++i];
+if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+addrhi = args[++i];
+}
+oi = args[++i];
 opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
@@ -2024,6 +2084,15 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg 
*args, bool is64)
 tcg_insn_unit *label_ptr[2];
 TCGReg base;
 
+tcg_debug_assert(datalo == tcg_target_call_oarg_regs[0]);
+if (TCG_TARGET_REG_BITS == 32 && is64) {
+tcg_debug_assert(datahi == tcg_target_call_oarg_regs[1]);
+}
+tcg_debug_assert(addrlo == softmmu_arg(0));
+if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+tcg_debug_assert(addrhi == softmmu_arg(1));
+}
+
 base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
 label_ptr, offsetof(CPUTLBEntry, addr_read));
 
@@ -2146,16 +2215,22 @@ static void tcg_out_qemu_st_direct(TCGContext *s, 
TCGReg datalo, TCGReg datahi,
 
 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 {
-TCGReg datalo, datahi, addrlo;
-TCGReg addrhi __attribute__((unused));
+TCGReg datalo, addrlo;
+TCGReg datahi __attribute__((unused)) = -1;

[Qemu-devel] [PATCH for-4.0 12/17] tcg/aarch64: Use TCG_TARGET_NEED_LDST_OOL_LABELS

2018-11-12 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 tcg/aarch64/tcg-target.h |   2 +-
 tcg/aarch64/tcg-target.inc.c | 191 +--
 2 files changed, 93 insertions(+), 100 deletions(-)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 9aea1d1771..d1bd77c41d 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -146,7 +146,7 @@ static inline void flush_icache_range(uintptr_t start, 
uintptr_t stop)
 void tb_target_set_jmp_target(uintptr_t, uintptr_t, uintptr_t);
 
 #ifdef CONFIG_SOFTMMU
-#define TCG_TARGET_NEED_LDST_LABELS
+#define TCG_TARGET_NEED_LDST_OOL_LABELS
 #endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 403f5caf14..8edea527f7 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -145,18 +145,6 @@ static const char 
*target_parse_constraint(TCGArgConstraint *ct,
 ct->ct |= TCG_CT_REG;
 ct->u.regs |= 0xull;
 break;
-case 'l': /* qemu_ld / qemu_st address, data_reg */
-ct->ct |= TCG_CT_REG;
-ct->u.regs = 0xu;
-#ifdef CONFIG_SOFTMMU
-/* x0 and x1 will be overwritten when reading the tlb entry,
-   and x2, and x3 for helper args, better to avoid using them. */
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_X0);
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_X1);
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_X2);
-tcg_regset_reset_reg(ct->u.regs, TCG_REG_X3);
-#endif
-break;
 case 'A': /* Valid for arithmetic immediate (positive or negative).  */
 ct->ct |= TCG_CT_CONST_AIMM;
 break;
@@ -1378,7 +1366,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, 
TCGReg d,
 }
 
 #ifdef CONFIG_SOFTMMU
-#include "tcg-ldst.inc.c"
+#include "tcg-ldst-ool.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * TCGMemOpIdx oi, uintptr_t ra)
@@ -1391,6 +1379,12 @@ static void * const qemu_ld_helpers[16] = {
 [MO_BEUW] = helper_be_lduw_mmu,
 [MO_BEUL] = helper_be_ldul_mmu,
 [MO_BEQ]  = helper_be_ldq_mmu,
+
+[MO_SB]   = helper_ret_ldsb_mmu,
+[MO_LESW] = helper_le_ldsw_mmu,
+[MO_LESL] = helper_le_ldsl_mmu,
+[MO_BESW] = helper_be_ldsw_mmu,
+[MO_BESL] = helper_be_ldsl_mmu,
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
@@ -1407,67 +1401,6 @@ static void * const qemu_st_helpers[16] = {
 [MO_BEQ]  = helper_be_stq_mmu,
 };
 
-static inline void tcg_out_adr(TCGContext *s, TCGReg rd, void *target)
-{
-ptrdiff_t offset = tcg_pcrel_diff(s, target);
-tcg_debug_assert(offset == sextract64(offset, 0, 21));
-tcg_out_insn(s, 3406, ADR, rd, offset);
-}
-
-static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
-{
-TCGMemOpIdx oi = lb->oi;
-TCGMemOp opc = get_memop(oi);
-TCGMemOp size = opc & MO_SIZE;
-
-reloc_pc19(lb->label_ptr[0], s->code_ptr);
-
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
-tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
-tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X2, oi);
-tcg_out_adr(s, TCG_REG_X3, lb->raddr);
-tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
-if (opc & MO_SIGN) {
-tcg_out_sxt(s, lb->type, size, lb->datalo_reg, TCG_REG_X0);
-} else {
-tcg_out_mov(s, size == MO_64, lb->datalo_reg, TCG_REG_X0);
-}
-
-tcg_out_goto(s, lb->raddr);
-}
-
-static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
-{
-TCGMemOpIdx oi = lb->oi;
-TCGMemOp opc = get_memop(oi);
-TCGMemOp size = opc & MO_SIZE;
-
-reloc_pc19(lb->label_ptr[0], s->code_ptr);
-
-tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_X0, TCG_AREG0);
-tcg_out_mov(s, TARGET_LONG_BITS == 64, TCG_REG_X1, lb->addrlo_reg);
-tcg_out_mov(s, size == MO_64, TCG_REG_X2, lb->datalo_reg);
-tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_X3, oi);
-tcg_out_adr(s, TCG_REG_X4, lb->raddr);
-tcg_out_call(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
-tcg_out_goto(s, lb->raddr);
-}
-
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
-TCGType ext, TCGReg data_reg, TCGReg addr_reg,
-tcg_insn_unit *raddr, tcg_insn_unit *label_ptr)
-{
-TCGLabelQemuLdst *label = new_ldst_label(s);
-
-label->is_ld = is_ld;
-label->oi = oi;
-label->type = ext;
-label->datalo_reg = data_reg;
-label->addrlo_reg = addr_reg;
-label->raddr = raddr;
-label->label_ptr[0] = label_ptr;
-}
-
 /*
  * Load and compare a TLB entry, emitting the conditional jump to the
  * slow path on failure.  Returns the register for the host addend.
@@ -1644,19 +1577,22 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg 
data_reg, TCGReg addr_reg,
 TCGMemOpIdx oi, TCGType ext)
 {

  1   2   3   4   >