I won't have time to review before next year. Results of a quick peek. On Sun, Dec 17, 2017 at 02:54:56PM +0200, Marcel Apfelbaum wrote: > +static void *mad_handler_thread(void *arg) > +{ > + PVRDMADev *dev = (PVRDMADev *)arg; > + int rc; > + QObject *o_ctx_id; > + unsigned long cqe_ctx_id; > + BackendCtx *bctx; > + /* > + int len; > + void *mad; > + */ > + > + pr_dbg("Starting\n"); > + > + dev->backend_dev.mad_thread.run = false; > + > + while (dev->backend_dev.mad_thread.run) { > + /* Get next buffer to pust MAD into */ > + o_ctx_id = qlist_pop(dev->backend_dev.mad_agent.recv_mads_list); > + if (!o_ctx_id) { > + /* pr_dbg("Error: No more free MADs buffers\n"); */ > + sleep(5);
Looks suspicious. What is the code above doing? > + continue; > + } > + cqe_ctx_id = qnum_get_uint(qobject_to_qnum(o_ctx_id)); > + bctx = rm_get_cqe_ctx(dev, cqe_ctx_id); > + if (unlikely(!bctx)) { > + pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id); > + continue; > + } > + > + pr_dbg("Calling umad_recv\n"); > + /* > + mad = pvrdma_pci_dma_map(PCI_DEVICE(dev), bctx->req.sge[0].addr, > + bctx->req.sge[0].length); > + > + len = bctx->req.sge[0].length; > + > + do { > + rc = umad_recv(dev->backend_dev.mad_agent.port_id, mad, &len, > 5000); That's a huge timeout. > + } while ( (rc != ETIMEDOUT) && dev->backend_dev.mad_thread.run); > + pr_dbg("umad_recv, rc=%d\n", rc); > + > + pvrdma_pci_dma_unmap(PCI_DEVICE(dev), mad, bctx->req.sge[0].length); > + */ > + rc = -1; > + > + /* rc is used as vendor_err */ > + comp_handler(rc > 0 ? IB_WC_SUCCESS : IB_WC_GENERAL_ERR, rc, > + bctx->up_ctx); > + > + rm_dealloc_cqe_ctx(dev, cqe_ctx_id); > + free(bctx); > + } > + > + pr_dbg("Going down\n"); > + /* TODO: Post cqe for all remaining MADs in list */ > + > + qlist_destroy_obj(QOBJECT(dev->backend_dev.mad_agent.recv_mads_list)); > + > + return NULL; > +} > + > +static void *comp_handler_thread(void *arg) > +{ > + PVRDMADev *dev = (PVRDMADev *)arg; > + int rc; > + struct ibv_cq *ev_cq; > + void *ev_ctx; > + > + pr_dbg("Starting\n"); > + > + while (dev->backend_dev.comp_thread.run) { > + pr_dbg("Waiting for completion on channel %p\n", > + dev->backend_dev.channel); > + rc = ibv_get_cq_event(dev->backend_dev.channel, &ev_cq, &ev_ctx); > + pr_dbg("ibv_get_cq_event=%d\n", rc); > + if (unlikely(rc)) { > + pr_dbg("---> ibv_get_cq_event (%d)\n", rc); > + continue; > + } > + > + if (unlikely(ibv_req_notify_cq(ev_cq, 0))) { > + pr_dbg("---> ibv_req_notify_cq\n"); > + } > + > + poll_cq(dev, ev_cq, false); > + > + ibv_ack_cq_events(ev_cq, 1); > + } > + > + pr_dbg("Going down\n"); > + /* TODO: Post cqe for all remaining buffs that were posted */ > + > + return NULL; > +} > + > +void 
backend_register_comp_handler(void (*handler)(int status, > + unsigned int vendor_err, void *ctx)) > +{ > + comp_handler = handler; > +} > + > +int backend_query_port(BackendDevice *dev, struct pvrdma_port_attr *attrs) > +{ > + int rc; > + struct ibv_port_attr port_attr; > + > + rc = ibv_query_port(dev->context, dev->port_num, &port_attr); > + if (rc) { > + pr_dbg("Error %d from ibv_query_port\n", rc); > + return -EIO; > + } > + > + attrs->state = port_attr.state; > + attrs->max_mtu = port_attr.max_mtu; > + attrs->active_mtu = port_attr.active_mtu; > + attrs->gid_tbl_len = port_attr.gid_tbl_len; > + attrs->pkey_tbl_len = port_attr.pkey_tbl_len; > + attrs->phys_state = port_attr.phys_state; > + > + return 0; > +} > + > +void backend_poll_cq(PVRDMADev *dev, BackendCQ *cq) > +{ > + poll_cq(dev, cq->ibcq, true); > +} > + > +static GHashTable *ah_hash; > + > +static struct ibv_ah *create_ah(BackendDevice *dev, struct ibv_pd *pd, > + union ibv_gid *dgid, uint8_t sgid_idx) > +{ > + GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid)); > + struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key); > + > + if (ah) { > + trace_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix), > + be64_to_cpu(dgid->global.interface_id)); > + } else { > + struct ibv_ah_attr ah_attr = { > + .is_global = 1, > + .port_num = dev->port_num, > + .grh.hop_limit = 1, > + }; > + > + ah_attr.grh.dgid = *dgid; > + ah_attr.grh.sgid_index = sgid_idx; > + > + ah = ibv_create_ah(pd, &ah_attr); > + if (ah) { > + g_hash_table_insert(ah_hash, ah_key, ah); > + } else { > + pr_dbg("ibv_create_ah failed for gid <%lx %lx>\n", > + be64_to_cpu(dgid->global.subnet_prefix), > + be64_to_cpu(dgid->global.interface_id)); > + } > + > + trace_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix), > + be64_to_cpu(dgid->global.interface_id)); > + } > + > + return ah; > +} > + > +static void destroy_ah(gpointer data) > +{ > + struct ibv_ah *ah = data; > + ibv_destroy_ah(ah); > +} > + > +static void 
ah_cache_init(void) > +{ > + ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal, > + NULL, destroy_ah); > +} > + > +static int send_mad(PVRDMADev *dev, struct pvrdma_sge *sge, u32 num_sge) > +{ > + int ret = -1; > + > + /* > + * TODO: Currently QP1 is not supported > + * > + PCIDevice *pci_dev = PCI_DEVICE(dev); > + char mad_msg[1024]; > + void *hdr, *msg; > + struct ib_user_mad *umad = (struct ib_user_mad *)&mad_msg; > + > + umad->length = sge[0].length + sge[1].length; > + > + if (num_sge != 2) > + return -EINVAL; > + > + pr_dbg("msg_len=%d\n", umad->length); > + > + hdr = pvrdma_pci_dma_map(pci_dev, sge[0].addr, sge[0].length); > + msg = pvrdma_pci_dma_map(pci_dev, sge[1].addr, sge[1].length); > + > + memcpy(&mad_msg[64], hdr, sge[0].length); > + memcpy(&mad_msg[sge[0].length+64], msg, sge[1].length); > + > + pvrdma_pci_dma_unmap(pci_dev, msg, sge[1].length); > + pvrdma_pci_dma_unmap(pci_dev, hdr, sge[0].length); > + > + ret = umad_send(dev->backend_dev.mad_agent.port_id, > + dev->backend_dev.mad_agent.agent_id, > + mad_msg, umad->length, 10, 10); > + */ Then what is the above code doing here? Also, isn't QP1 a big deal? If it's missing then how do you do multicast etc? How does guest know it's missing? ... > diff --git a/hw/net/pvrdma/pvrdma_utils.h b/hw/net/pvrdma/pvrdma_utils.h > new file mode 100644 > index 0000000000..a09e39946c > --- /dev/null > +++ b/hw/net/pvrdma/pvrdma_utils.h > @@ -0,0 +1,41 @@ > +/* > + * QEMU VMWARE paravirtual RDMA interface definitions > + * > + * Developed by Oracle & Redhat > + * > + * Authors: > + * Yuval Shaia <yuval.sh...@oracle.com> > + * Marcel Apfelbaum <mar...@redhat.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + * See the COPYING file in the top-level directory. > + * > + */ > + > +#ifndef PVRDMA_UTILS_H > +#define PVRDMA_UTILS_H > + > +#include <include/hw/pci/pci.h> > + > +#define pr_info(fmt, ...) 
\ > + fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__,\ > + ## __VA_ARGS__) > + > +#define pr_err(fmt, ...) \ > + fprintf(stderr, "%s: Error at %-20s (%3d): " fmt, "pvrdma", __func__, \ > + __LINE__, ## __VA_ARGS__) > + > +#ifdef PVRDMA_DEBUG > +#define pr_dbg(fmt, ...) \ > + fprintf(stdout, "%s: %-20s (%3d): " fmt, "pvrdma", __func__, __LINE__,\ > + ## __VA_ARGS__) > +#else > +#define pr_dbg(fmt, ...) > +#endif > + > +void pvrdma_pci_dma_unmap(PCIDevice *dev, void *buffer, dma_addr_t len); > +void *pvrdma_pci_dma_map(PCIDevice *dev, dma_addr_t addr, dma_addr_t plen); > +void *map_to_pdir(PCIDevice *pdev, uint64_t pdir_dma, uint32_t nchunks, > + size_t length); > + > +#endif Can you make sure all prefixes are pvrdma_? > diff --git a/hw/net/pvrdma/trace-events b/hw/net/pvrdma/trace-events > new file mode 100644 > index 0000000000..bbc52286bc > --- /dev/null > +++ b/hw/net/pvrdma/trace-events > @@ -0,0 +1,9 @@ > +# See docs/tracing.txt for syntax documentation. > + > +# hw/net/pvrdma/pvrdma_main.c > +pvrdma_regs_read(uint64_t addr, uint64_t val) "regs[0x%lx] = 0x%lx" > +pvrdma_regs_write(uint64_t addr, uint64_t val) "regs[0x%lx] = 0x%lx" > + > +#hw/net/pvrdma/pvrdma_backend.c > +create_ah_cache_hit(uint64_t subnet, uint64_t net_id) "subnet = 0x%lx net_id > = 0x%lx" > +create_ah_cache_miss(uint64_t subnet, uint64_t net_id) "subnet = 0x%lx > net_id = 0x%lx" > diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h > index 35df1874a9..1dbf53627c 100644 > --- a/include/hw/pci/pci_ids.h > +++ b/include/hw/pci/pci_ids.h > @@ -266,4 +266,7 @@ > #define PCI_VENDOR_ID_TEWS 0x1498 > #define PCI_DEVICE_ID_TEWS_TPCI200 0x30C8 > > +#define PCI_VENDOR_ID_VMWARE 0x15ad > +#define PCI_DEVICE_ID_VMWARE_PVRDMA 0x0820 > + > #endif > -- > 2.13.5