Complete the mlx5 driver by adding CQ/QP creation, QP state transitions, WQE posting, CQ polling, and the memcpy_start/memcpy_wait callbacks. After this patch the driver is functional for DMA tests.
The data path implements RDMA Write self-loopback via an RC QP with force-loopback. WQEs are posted to a 16-entry send queue with an NC doorbell, and completions are polled from a 16-entry CQ. Assisted-by: Claude:claude-opus-4.6 Signed-off-by: Jason Gunthorpe <[email protected]> --- .../selftests/vfio/lib/drivers/mlx5/mlx5.c | 359 +++++++++++++++++- 1 file changed, 357 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c index 804801cc564e7a..e5e75adb253166 100644 --- a/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c +++ b/tools/testing/selftests/vfio/lib/drivers/mlx5/mlx5.c @@ -1343,6 +1343,354 @@ static void mlx5st_destroy_mkey(struct mlx5st_device *dev) mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } +/* + * CQ create/destroy + */ + +static void mlx5st_create_cq(struct mlx5st_device *dev) +{ + struct vfio_pci_device *device = dev->device; + u64 in[MLX5_ST_SZ_QW(create_cq_in) + 1] = {}; + u32 out[MLX5_ST_SZ_DW(create_cq_out)] = {}; + struct mlx5_ifc_cqc_bits *cqc; + unsigned int i; + __be64 *pas; + + /* Initialize CQEs before CREATE_CQ: opcode=0xF, owner=1 */ + for (i = 0; i < CQ_CQE_CNT; i++) { + struct mlx5st_cqe64 *cqe = &dev->cq_buf[i]; + + MLX5_SET(cqe64, cqe, opcode, 0xF); + MLX5_SET_ONCE(cqe64, cqe, owner, 1); + } + + MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ); + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, LOG_CQ_SIZE); + MLX5_SET(cqc, cqc, uar_page, dev->uar_page); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, dev->eqn); + MLX5_SET(cqc, cqc, cqe_sz, 0); + pas = MLX5_ADDR_OF(create_cq_in, in, pas); + MLX5_SET(cqc, cqc, page_offset, mlx5st_fill_pas(device, dev->cq_buf, pas)); + MLX5_SET(cqc, cqc, log_page_size, 0); + MLX5_SET64(cqc, cqc, dbr_addr, to_iova(device, &dev->cq_dbrec)); + + mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + + dev->cqn = MLX5_GET(create_cq_out, out, cqn); + dev->cq_ci = 0; + dev_dbg(device, "Created CQ: cqn=%u, %d entries\n", dev->cqn, + CQ_CQE_CNT); +} + +static void mlx5st_destroy_cq(struct mlx5st_device *dev) +{ + u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {}; + u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {}; + + MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ); + MLX5_SET(destroy_cq_in, in, cqn, dev->cqn); + mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +/* + * QP create/destroy + */ + +static void mlx5st_create_qp(struct mlx5st_device *dev) +{ + struct vfio_pci_device *device = dev->device; + u64 in[MLX5_ST_SZ_QW(create_qp_in) + 1] = {}; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + struct mlx5_ifc_qpc_bits *qpc; + __be64 *pas; + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, st, MLX5_QPC_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + MLX5_SET(qpc, qpc, pd, dev->pdn); + MLX5_SET(qpc, qpc, uar_page, dev->uar_page); + MLX5_SET(qpc, qpc, cqn_snd, dev->cqn); + MLX5_SET(qpc, qpc, cqn_rcv, dev->cqn); + MLX5_SET(qpc, qpc, log_sq_size, LOG_SQ_SIZE); + MLX5_SET(qpc, qpc, log_msg_max, dev->log_max_msg); + MLX5_SET(qpc, qpc, rq_type, 0x3); + MLX5_SET(qpc, qpc, ts_format, 1); + pas = MLX5_ADDR_OF(create_qp_in, in, pas); + MLX5_SET(qpc, qpc, page_offset, + mlx5st_fill_pas(device, dev->sq_buf, pas)); + MLX5_SET(qpc, qpc, log_page_size, 0); + MLX5_SET64(qpc, qpc, dbr_addr, to_iova(device, &dev->qp_dbrec)); + + mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + + dev->qpn = MLX5_GET(create_qp_out, out, qpn); + dev->sq_pi = 0; + dev_dbg(device, "Created QP: qpn=%u, RC, sq=%d wqes\n", dev->qpn, + SQ_WQE_CNT); +} + +static void mlx5st_destroy_qp(struct mlx5st_device *dev) +{ + u32 out[MLX5_ST_SZ_DW(destroy_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, dev->qpn); + mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +/* + * QP state transitions + */ + +static void mlx5st_qp_rst2init(struct mlx5st_device *dev) +{ + u32 out[MLX5_ST_SZ_DW(rst2init_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; + struct mlx5_ifc_qpc_bits *qpc = MLX5_ADDR_OF(rst2init_qp_in, in, qpc); + + MLX5_SET(rst2init_qp_in, in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, in, qpn, dev->qpn); + + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + MLX5_SET(qpc, qpc, rre, 1); + MLX5_SET(qpc, qpc, rwe, 1); + + mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + dev_dbg(dev->device, "QP RST->INIT\n"); +} + +static void mlx5st_qp_init2rtr(struct mlx5st_device *dev) +{ + u32 out[MLX5_ST_SZ_DW(init2rtr_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; + struct mlx5_ifc_qpc_bits *qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc); + + MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, in, qpn, dev->qpn); + + MLX5_SET(qpc, qpc, mtu, 3); + MLX5_SET(qpc, qpc, log_msg_max, dev->log_max_msg); + MLX5_SET(qpc, qpc, remote_qpn, dev->qpn); + MLX5_SET(qpc, qpc, min_rnr_nak, 12); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, primary_address_path.fl, 1); + + mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + dev_dbg(dev->device, "QP INIT->RTR (fl=1)\n"); +} + +static void mlx5st_qp_rtr2rts(struct mlx5st_device *dev) +{ + u32 out[MLX5_ST_SZ_DW(rtr2rts_qp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; + struct mlx5_ifc_qpc_bits *qpc = MLX5_ADDR_OF(rtr2rts_qp_in, in, qpc); + + MLX5_SET(rtr2rts_qp_in, in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, in, qpn, dev->qpn); + + MLX5_SET(qpc, qpc, log_ack_req_freq, 0); + MLX5_SET(qpc, qpc, retry_count, 7); + MLX5_SET(qpc, qpc, rnr_retry, 7); + MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 14); + + mlx5st_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + dev_dbg(dev->device, "QP RTR->RTS\n"); +} + +/* + * Post RDMA Write WQE + */ +static void mlx5st_post_rdma_write(struct mlx5st_device *dev, u64 src_addr, + u32 src_lkey, u64 dst_addr, u32 dst_rkey, + u32 length, bool signaled) +{ + struct mlx5st_send_wqe *wqe; + unsigned int idx; + + idx = dev->sq_pi % SQ_WQE_CNT; + wqe = &dev->sq_buf[idx]; + + memset(wqe, 0, sizeof(*wqe)); + MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, opcode, MLX5_OPCODE_RDMA_WRITE); + MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, wqe_index, dev->sq_pi); + MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, qp_or_sq, dev->qpn); + MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, ds, MLX5_RDMA_WRITE_DS); + if (signaled) + MLX5_SET(wqe_ctrl_seg, &wqe->ctrl, ce, MLX5_WQE_CE_CQE_ALWAYS); + + MLX5_SET64(wqe_raddr_seg, &wqe->raddr, raddr, dst_addr); + MLX5_SET(wqe_raddr_seg, &wqe->raddr, rkey, dst_rkey); + + MLX5_SET(wqe_data_seg, &wqe->data, byte_count, length); + MLX5_SET(wqe_data_seg, &wqe->data, lkey, src_lkey); + MLX5_SET64(wqe_data_seg, &wqe->data, addr, src_addr); + + dev->sq_pi++; + + /* Ensure WQE is visible to device before doorbell record */ + dma_wmb(); + + WRITE_ONCE(dev->qp_dbrec.send_counter, + cpu_to_be32(dev->sq_pi & 0xffff)); + + /* + * Ring doorbell: write first 8 bytes of ctrl to UAR BF register, + * iowrite has an internal dma_wmb() so the doorbell record will be + * visible. + */ + iowrite64be(be64_to_cpu(*(__be64 *)wqe), + (u8 __iomem *)dev->uar_base + dev->uar_bf_offset); + dev->uar_bf_offset ^= MLX5_BF_SIZE; +} + +/* + * Poll CQ + */ +static int mlx5st_poll_cq_batch(struct mlx5st_device *dev, + unsigned int max_cqe) +{ + unsigned int polled = 0; + + while (polled < max_cqe) { + unsigned int idx = dev->cq_ci % CQ_CQE_CNT; + struct mlx5st_cqe64 *cqe = &dev->cq_buf[idx]; + u8 owner, opcode; + + owner = MLX5_GET_ONCE(cqe64, cqe, owner); + if (owner != ((dev->cq_ci >> LOG_CQ_SIZE) & 1)) + break; + + dma_rmb(); + + opcode = MLX5_GET(cqe64, cqe, opcode); + + dev->cq_ci++; + WRITE_ONCE(dev->cq_dbrec.recv_counter, + cpu_to_be32(dev->cq_ci & 0xffffff)); + + if (opcode == MLX5_CQE_REQ) { + dev->sq_ci = + (u16)(MLX5_GET(cqe64, cqe, wqe_counter) + 1); + polled++; + continue; + } + if (opcode == MLX5_CQE_REQ_ERR || + opcode == MLX5_CQE_RESP_ERR) { + dev_dbg(dev->device, + "CQE error: opcode=0x%x syndrome=0x%x vendor=0x%x\n", + opcode, + MLX5_GET(cqe64, cqe, error_syndrome.syndrome), + MLX5_GET(cqe64, cqe, + error_syndrome.vendor_error_syndrome)); + return -1; + } + dev_err(dev->device, "CQE unexpected opcode=0x%x\n", opcode); + return -1; + } + + return polled; +} + +static int mlx5st_poll_cq(struct mlx5st_device *dev, unsigned int timeout_ms) +{ + struct timespec start, now; + unsigned int elapsed; + int ret; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (;;) { + ret = mlx5st_poll_cq_batch(dev, 1); + if (ret < 0) + return -1; + if (ret > 0) + return 0; + + if (dev->have_eq) + mlx5st_process_events(dev); + + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed = (now.tv_sec - start.tv_sec) * 1000 + + (now.tv_nsec - start.tv_nsec) / 1000000; + if (elapsed > timeout_ms) { + dev_err(dev->device, "CQ poll timeout after %u ms\n", + timeout_ms); + return -1; + } + } +} + +/* + * Data path setup/teardown helpers + */ + +static void mlx5st_setup_datapath(struct mlx5st_device *dev) +{ + mlx5st_create_cq(dev); + mlx5st_create_qp(dev); + mlx5st_qp_rst2init(dev); + mlx5st_qp_init2rtr(dev); + mlx5st_qp_rtr2rts(dev); +} + +static void mlx5st_teardown_datapath(struct mlx5st_device *dev) +{ + if (dev->qpn) { + mlx5st_destroy_qp(dev); + dev->qpn = 0; + } + if (dev->cqn) { + mlx5st_destroy_cq(dev); + dev->cqn = 0; + } + dev->sq_pi = 0; + dev->sq_ci = 0; + memset(&dev->qp_dbrec, 0, sizeof(dev->qp_dbrec)); + memset(&dev->cq_dbrec, 0, sizeof(dev->cq_dbrec)); +} + +/* + * memcpy callbacks + */ + +#define MLX5ST_MEMCPY_TIMEOUT_MS 60000 + +static void mlx5st_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count) +{ + struct mlx5st_device *dev = to_mlx5st(device); + u64 i; + + for (i = 0; i < count; i++) { + bool signaled = (i == count - 1); + + mlx5st_post_rdma_write(dev, src, dev->global_lkey, dst, + dev->global_rkey, size, signaled); + } +} + +static int mlx5st_memcpy_wait(struct vfio_pci_device *device) +{ + struct mlx5st_device *dev = to_mlx5st(device); + int ret; + + ret = mlx5st_poll_cq(dev, MLX5ST_MEMCPY_TIMEOUT_MS); + if (ret) { + /* + * CQE error puts the QP in error state. Rebuild the data path + * so subsequent operations can succeed. + */ + mlx5st_teardown_datapath(dev); + mlx5st_setup_datapath(dev); + } + return ret; +} + /* * Driver ops callbacks */ @@ -1373,6 +1721,11 @@ static void mlx5st_init(struct vfio_pci_device *device) mlx5st_alloc_pd(dev); mlx5st_create_mkey(dev); + mlx5st_setup_datapath(dev); + + device->driver.max_memcpy_size = 1ULL << dev->log_max_msg; + device->driver.max_memcpy_count = SQ_WQE_CNT - 1; + dev_dbg(device, "mlx5 driver initialized\n"); } @@ -1380,6 +1733,8 @@ static void mlx5st_remove(struct vfio_pci_device *device) { struct mlx5st_device *dev = to_mlx5st(device); + mlx5st_teardown_datapath(dev); + dev_dbg(device, "teardown: destroy_mkey\n"); if (dev->mkey_index) { mlx5st_destroy_mkey(dev); @@ -1408,7 +1763,7 @@ struct vfio_pci_driver_ops mlx5st_ops = { .probe = mlx5st_probe, .init = mlx5st_init, .remove = mlx5st_remove, - .memcpy_start = NULL, - .memcpy_wait = NULL, + .memcpy_start = mlx5st_memcpy_start, + .memcpy_wait = mlx5st_memcpy_wait, .send_msi = NULL, }; -- 2.43.0
