Implement the dmadev fast path for the AMD AE4DMA PMD. This commit adds: - copy enqueue (rte_dma_copy): write an AE4DMA descriptor for a memory-to-memory transfer; on RTE_DMA_OP_FLAG_SUBMIT the doorbell is rung immediately. - submit (rte_dma_submit): advance the per-queue write_idx register to expose pending descriptors to the hardware. - completion (rte_dma_completed / rte_dma_completed_status): completion is detected via the hardware's per-queue read_idx register, which the engine advances as it processes descriptors. The descriptor status / err_code bytes are read only to classify each drained slot as success or failure, and HW error codes are translated to the dmadev RTE_DMA_STATUS_* enumeration. - burst capacity (rte_dma_burst_capacity): report the number of free descriptor slots, taking into account the one slot reserved to distinguish full from empty on the power-of-two ring.
The fast path entry points are wired through fp_obj in ae4dma_dmadev_create(). The fill capability is not advertised; fp_obj->fill is left zero-initialised. Signed-off-by: Raghavendra Ningoji <[email protected]> --- doc/guides/dmadevs/ae4dma.rst | 22 +++ drivers/dma/ae4dma/ae4dma_dmadev.c | 288 +++++++++++++++++++++++++++++ 2 files changed, 310 insertions(+) diff --git a/doc/guides/dmadevs/ae4dma.rst b/doc/guides/dmadevs/ae4dma.rst index a85c1d92ca..37a2096ccf 100644 --- a/doc/guides/dmadevs/ae4dma.rst +++ b/doc/guides/dmadevs/ae4dma.rst @@ -51,3 +51,25 @@ On probe the PMD performs the following steps for each PCI function: IOVA-contiguous memory, programs the queue base address and ring depth into the per-queue registers, and enables the queue. * Interrupts are masked; completion is polled by the application. + +Usage +----- + +Once a dmadev has been started, copies are submitted with +``rte_dma_copy()`` and completions are reaped with ``rte_dma_completed()`` +or ``rte_dma_completed_status()``. See the +:ref:`Enqueue / Dequeue API <dmadev_enqueue_dequeue>` section of the +dmadev library documentation for details. + +Limitations +----------- + +* Only memory-to-memory copies are supported. Fill, scatter-gather and + any other operation types are not advertised in + ``rte_dma_info::dev_capa``. +* The maximum number of descriptors per virtual channel is fixed by + hardware at 32. The PMD rounds the requested ring size up to a + power of two and clamps it to 32. +* Only a single virtual channel per dmadev is supported; use the 16 + per-PCI-function dmadevs to obtain channel-level parallelism. +* Interrupt-driven completion is not supported. diff --git a/drivers/dma/ae4dma/ae4dma_dmadev.c b/drivers/dma/ae4dma/ae4dma_dmadev.c index dfda723c13..0f223fc40c 100644 --- a/drivers/dma/ae4dma/ae4dma_dmadev.c +++ b/drivers/dma/ae4dma/ae4dma_dmadev.c @@ -167,6 +167,73 @@ ae4dma_dev_close(struct rte_dma_dev *dev) cmd_q->qbase_phys_addr = 0; return 0; } + +/* trigger h/w to process enqued desc:doorbell - by next_write */ +static inline void +__submit(struct ae4dma_dmadev *ae4dma) +{ + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; + uint16_t write_idx = cmd_q->next_write; + uint16_t nb = cmd_q->qcfg.nb_desc; + + AE4DMA_WRITE_REG(&cmd_q->hwq_regs->write_idx, write_idx); + if (nb != 0) + cmd_q->stats.submitted += (uint16_t)((cmd_q->next_write - cmd_q->last_write + + nb) % nb); + cmd_q->last_write = cmd_q->next_write; +} + +static int +ae4dma_submit(void *dev_private, uint16_t vchan __rte_unused) +{ + struct ae4dma_dmadev *ae4dma = dev_private; + + __submit(ae4dma); + return 0; +} + +/* Write descriptor for enqueue (copy only). */ +static inline int +__write_desc_copy(void *dev_private, rte_iova_t src, rte_iova_t dst, + uint32_t len, uint64_t flags) +{ + struct ae4dma_dmadev *ae4dma = dev_private; + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; + struct ae4dma_desc *dma_desc; + uint16_t ret; + uint16_t nb = cmd_q->qcfg.nb_desc; + uint16_t write = cmd_q->next_write; + + if (nb == 0) + return -EINVAL; + + /* Reserve one slot to distinguish full from empty (power-of-two ring). */ + if ((uint32_t)cmd_q->ring_buff_count >= (uint32_t)(nb - 1)) + return -ENOSPC; + + dma_desc = &cmd_q->qbase_desc[write]; + memset(dma_desc, 0, sizeof(*dma_desc)); + dma_desc->length = len; + dma_desc->src_hi = upper_32_bits(src); + dma_desc->src_lo = lower_32_bits(src); + dma_desc->dst_hi = upper_32_bits(dst); + dma_desc->dst_lo = lower_32_bits(dst); + cmd_q->ring_buff_count++; + cmd_q->next_write = (uint16_t)((write + 1) % nb); + ret = write; + if (flags & RTE_DMA_OP_FLAG_SUBMIT) + __submit(ae4dma); + return ret; +} + +/* Enqueue a copy operation onto the ae4dma device. */ +static int +ae4dma_enqueue_copy(void *dev_private, uint16_t vchan __rte_unused, + rte_iova_t src, rte_iova_t dst, uint32_t length, uint64_t flags) +{ + return __write_desc_copy(dev_private, src, dst, length, flags); +} + /* Dump DMA device info. */ static int ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f) @@ -197,6 +264,220 @@ ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f) cmd_q->stats.errors); return 0; } + +/* Translates AE4DMA ChanERRs to DMA error codes. */ +static inline enum rte_dma_status_code +__translate_status_ae4dma_to_dma(enum ae4dma_dma_err status) +{ + AE4DMA_PMD_DEBUG("ae4dma desc status = %d", status); + + switch (status) { + case AE4DMA_DMA_ERR_NO_ERR: + return RTE_DMA_STATUS_SUCCESSFUL; + case AE4DMA_DMA_ERR_INV_LEN: + return RTE_DMA_STATUS_INVALID_LENGTH; + case AE4DMA_DMA_ERR_INV_SRC: + return RTE_DMA_STATUS_INVALID_SRC_ADDR; + case AE4DMA_DMA_ERR_INV_DST: + return RTE_DMA_STATUS_INVALID_DST_ADDR; + case AE4DMA_DMA_ERR_INV_ALIGN: + /* Name matches DPDK public enum spelling. */ + return RTE_DMA_STATUS_DATA_POISION; + case AE4DMA_DMA_ERR_INV_HEADER: + case AE4DMA_DMA_ERR_INV_STATUS: + return RTE_DMA_STATUS_ERROR_UNKNOWN; + default: + return RTE_DMA_STATUS_ERROR_UNKNOWN; + } +} + +/* + * Scan HW queue for completed descriptors (non-blocking). + * + * The AE4DMA engine signals completion by advancing the per-queue + * `read_idx` register; it does not (reliably) write a status value + * back into the descriptor. We therefore use the HW `read_idx` + * register as the source of truth and only inspect the descriptor's + * `dw1.err_code` byte to classify each completion as success or + * failure. + * + * @param cmd_q + * The AE4DMA command queue. + * @param max_ops + * Maximum descriptors to process this call. + * @param[out] failed_count + * Number of completed descriptors that did not report success. + * @return + * Number of descriptors completed (success + failure), <= max_ops. + */ +static inline uint16_t +ae4dma_scan_hwq(struct ae4dma_cmd_queue *cmd_q, uint16_t max_ops, + uint16_t *failed_count) +{ + volatile struct ae4dma_desc *hw_desc; + uint16_t events_count = 0, fails = 0; + uint16_t tail; + uint16_t nb = cmd_q->qcfg.nb_desc; + uint16_t mask; + uint16_t hw_read_idx; + uint16_t in_flight; + uint16_t scan_cap; + + if (nb == 0 || cmd_q->ring_buff_count == 0) { + *failed_count = 0; + return 0; + } + mask = nb - 1; + + hw_read_idx = (uint16_t)(AE4DMA_READ_REG(&cmd_q->hwq_regs->read_idx) & mask); + tail = cmd_q->next_read; + + /* + * Descriptors completed since our last visit live in the + * half-open ring range [tail, hw_read_idx). If HW hasn't + * moved we have nothing to do. + */ + in_flight = (uint16_t)((hw_read_idx - tail) & mask); + if (in_flight == 0) { + *failed_count = 0; + return 0; + } + + scan_cap = max_ops; + if (scan_cap > AE4DMA_DESCRIPTORS_PER_CMDQ) + scan_cap = AE4DMA_DESCRIPTORS_PER_CMDQ; + if (scan_cap > in_flight) + scan_cap = in_flight; + if (scan_cap > cmd_q->ring_buff_count) + scan_cap = (uint16_t)cmd_q->ring_buff_count; + + while (events_count < scan_cap) { + uint8_t hw_status; + uint8_t hw_err; + + hw_desc = &cmd_q->qbase_desc[tail]; + hw_status = hw_desc->dw1.status; + hw_err = hw_desc->dw1.err_code; + + /* + * read_idx advancing is the definitive completion + * signal. The per-descriptor status byte is informational + * and may not yet be written when we observe it: + * + * AE4DMA_DMA_DESC_ERROR (4) + * Hard failure - err_code names the precise cause. + * AE4DMA_DMA_DESC_COMPLETED (3) or 0 + * Success. + * AE4DMA_DMA_DESC_VALIDATED (1) / _PROCESSED (2) + * Benign race: HW had not finished updating the + * status byte at the instant we read it. Since + * read_idx has moved past this slot, treat it as + * success unless err_code says otherwise. + * + * A non-zero err_code is treated as a failure regardless + * of the observed status value. + */ + if (hw_status == AE4DMA_DMA_DESC_ERROR || + hw_err != AE4DMA_DMA_ERR_NO_ERR) { + fails++; + AE4DMA_PMD_WARN("Desc failed: status=%u err=%u", + hw_status, hw_err); + } + cmd_q->status[events_count] = (enum ae4dma_dma_err)hw_err; + cmd_q->ring_buff_count--; + events_count++; + tail = (tail + 1) & mask; + } + + cmd_q->stats.completed += events_count; + cmd_q->stats.errors += fails; + cmd_q->next_read = tail; + *failed_count = fails; + return events_count; +} + +/* Returns successful operations count and sets error flag if any errors. */ +static uint16_t +ae4dma_completed(void *dev_private, uint16_t vchan __rte_unused, + const uint16_t max_ops, uint16_t *last_idx, bool *has_error) +{ + struct ae4dma_dmadev *ae4dma = dev_private; + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; + uint16_t cpl_count, sl_count; + uint16_t err_count = 0; + uint16_t nb = cmd_q->qcfg.nb_desc; + + *has_error = false; + + cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count); + + if (cpl_count > max_ops) + cpl_count = max_ops; + + if (cpl_count > 0 && last_idx != NULL) + *last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb); + + sl_count = cpl_count - err_count; + if (err_count) + *has_error = true; + + return sl_count; +} + +static uint16_t +ae4dma_completed_status(void *dev_private, uint16_t vchan __rte_unused, + uint16_t max_ops, uint16_t *last_idx, + enum rte_dma_status_code *status) +{ + struct ae4dma_dmadev *ae4dma = dev_private; + struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; + uint16_t cpl_count; + uint16_t i; + uint16_t err_count = 0; + uint16_t nb = cmd_q->qcfg.nb_desc; + + cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count); + + if (cpl_count > max_ops) + cpl_count = max_ops; + + if (cpl_count > 0 && last_idx != NULL) + *last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb); + + if (likely(err_count == 0)) { + for (i = 0; i < cpl_count; i++) + status[i] = RTE_DMA_STATUS_SUCCESSFUL; + } else { + for (i = 0; i < cpl_count; i++) + status[i] = __translate_status_ae4dma_to_dma(cmd_q->status[i]); + } + + return cpl_count; +} + +/* Get the remaining capacity of the ring. */ +static uint16_t +ae4dma_burst_capacity(const void *dev_private, uint16_t vchan __rte_unused) +{ + const struct ae4dma_dmadev *ae4dma = dev_private; + const struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q; + uint16_t nb = cmd_q->qcfg.nb_desc; + uint16_t mask; + uint16_t read_idx = cmd_q->next_read; + uint16_t write_idx = cmd_q->next_write; + uint16_t used; + + if (nb < 2 || !rte_is_power_of_2(nb)) + return 0; + + mask = nb - 1; + used = (uint16_t)((write_idx - read_idx) & mask); + /* One slot reserved (same rule as enqueue). */ + if (used >= nb - 1) + return 0; + return (uint16_t)(nb - 1 - used); +} + /* Retrieve the generic stats of a DMA device. */ static int ae4dma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan __rte_unused, @@ -357,6 +638,13 @@ ae4dma_dmadev_create(const char *name, struct rte_pci_device *dev, uint8_t qn) dmadev->fp_obj->dev_private = dmadev->data->dev_private; dmadev->dev_ops = &ae4dma_dmadev_ops; + dmadev->fp_obj->burst_capacity = ae4dma_burst_capacity; + dmadev->fp_obj->completed = ae4dma_completed; + dmadev->fp_obj->completed_status = ae4dma_completed_status; + dmadev->fp_obj->copy = ae4dma_enqueue_copy; + dmadev->fp_obj->submit = ae4dma_submit; + /* fill capability not advertised: leave fp_obj->fill as zero-initialised. */ + ae4dma = dmadev->data->dev_private; ae4dma->dmadev = dmadev; ae4dma->pci = dev; -- 2.34.1

