In DMA mode, ECSPI transfers only one word per frame, and the BURST_LENGTH
update needed between words stalls SCLK, which significantly hurts
performance.
To improve throughput, configure BURST_LENGTH as large as possible (up
to 512 bytes per frame) instead of the word length. This avoids the
delay between words. When the transfer length is not 4-byte aligned,
use bounce buffers to align the data for DMA: on TX, aligned words are
pushed to the TXFIFO; on RX, the DMA buffer is trimmed after the
transfer completes.

Introduce a new dma_data_package structure to store:
1. the BURST_LENGTH value for each DMA request
2. the variables needed for DMA submission
3. the DMA transmission length and the actual data length

Handle three cases (sketched below):
- len <= 512 bytes: one package, BURST_LENGTH = len * 8 - 1
- len > 512 and a multiple of 512: one package, BURST_LENGTH = max (512 bytes)
- len > 512 and not a multiple of 512: two packages, the second for the tail data

Performance test (spidev_test @10MHz, 4KB):
Before: tx/rx ~6651.9 kbps
After:  tx/rx ~9922.2 kbps (~50% improvement)

For compatibility with slow SPI devices, add a configurable word delay
in DMA mode. When a word delay is set, dynamic burst is disabled and
BURST_LENGTH equals the word length.
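The package split reduces to the following minimal sketch (illustrative
only, not driver code; max_burst stands for the 512-byte
MX51_ECSPI_CTRL_MAX_BURST, and the BURST_LENGTH field holds the bit
count minus one):

	/* Illustrative helper, not in the driver: compute the package split. */
	static void burst_split(unsigned int len, unsigned int max_burst,
				unsigned int *packages, unsigned int *pre_bl,
				unsigned int *tail_bl)
	{
		if (len <= max_burst) {			/* one package */
			*packages = 1;
			*pre_bl = len * 8 - 1;
		} else if (len % max_burst == 0) {	/* one package, full bursts */
			*packages = 1;
			*pre_bl = max_burst * 8 - 1;
		} else {				/* head package + tail package */
			*packages = 2;
			*pre_bl = max_burst * 8 - 1;
			*tail_bl = (len % max_burst) * 8 - 1;
		}
	}

For instance, the 4 KiB benchmark transfer is a multiple of 512 bytes,
so it goes out as a single package with BURST_LENGTH = 4095.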
Signed-off-by: Carlos Song <[email protected]>
---
 drivers/spi/spi-imx.c | 413 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 377 insertions(+), 36 deletions(-)

diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index 42f64d9535c9..045f4ffd680a 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(polling_limit_us,
 #define MX51_ECSPI_CTRL_MAX_BURST	512
 /* The maximum bytes that IMX53_ECSPI can transfer in target mode.*/
 #define MX53_MAX_TRANSFER_BYTES		512
+#define BYTES_PER_32BITS_WORD		4
 
 enum spi_imx_devtype {
 	IMX1_CSPI,
@@ -95,6 +96,16 @@ struct spi_imx_devtype_data {
 	enum spi_imx_devtype devtype;
 };
 
+struct dma_data_package {
+	u32 cmd_word;
+	void *dma_rx_buf;
+	void *dma_tx_buf;
+	dma_addr_t dma_tx_addr;
+	dma_addr_t dma_rx_addr;
+	int dma_len;
+	int data_len;
+};
+
 struct spi_imx_data {
 	struct spi_controller *controller;
 	struct device *dev;
@@ -130,6 +141,9 @@ struct spi_imx_data {
 	u32 wml;
 	struct completion dma_rx_completion;
 	struct completion dma_tx_completion;
+	size_t dma_package_num;
+	struct dma_data_package *dma_data __counted_by(dma_package_num);
+	int rx_offset;
 
 	const struct spi_imx_devtype_data *devtype_data;
 };
@@ -189,6 +203,9 @@ MXC_SPI_BUF_TX(u16)
 MXC_SPI_BUF_RX(u32)
 MXC_SPI_BUF_TX(u32)
 
+/* Align to cache line to avoid swiotlb bounce */
+#define DMA_CACHE_ALIGNED_LEN(x)	ALIGN((x), dma_get_cache_alignment())
+
 /* First entry is reserved, second entry is valid only if SDHC_SPIEN is set
  * (which is currently not the case in this driver)
  */
@@ -253,6 +270,14 @@ static bool spi_imx_can_dma(struct spi_controller *controller, struct spi_device
 	if (transfer->len < spi_imx->devtype_data->fifo_size)
 		return false;
 
+	/* DMA can only transfer data in whole bytes */
+	if (spi_imx->bits_per_word != 8 && spi_imx->bits_per_word != 16 &&
+	    spi_imx->bits_per_word != 32)
+		return false;
+
+	if (transfer->len >= MAX_SDMA_BD_BYTES)
+		return false;
+
 	spi_imx->dynamic_burst = 0;
 
 	return true;
@@ -1398,8 +1423,6 @@ static int spi_imx_sdma_init(struct device *dev, struct spi_imx_data *spi_imx,
 	init_completion(&spi_imx->dma_rx_completion);
 	init_completion(&spi_imx->dma_tx_completion);
-	controller->can_dma = spi_imx_can_dma;
-	controller->max_dma_len = MAX_SDMA_BD_BYTES;
 	spi_imx->controller->flags = SPI_CONTROLLER_MUST_RX |
 				     SPI_CONTROLLER_MUST_TX;
 
@@ -1437,10 +1460,259 @@ static int spi_imx_calculate_timeout(struct spi_imx_data *spi_imx, int size)
 	return secs_to_jiffies(2 * timeout);
 }
 
+static void spi_imx_dma_unmap(struct spi_imx_data *spi_imx,
+			      struct dma_data_package *dma_data)
+{
+	struct device *tx_dev = spi_imx->controller->dma_tx->device->dev;
+	struct device *rx_dev = spi_imx->controller->dma_rx->device->dev;
+
+	dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+			 DMA_TO_DEVICE);
+	dma_unmap_single(rx_dev, dma_data->dma_rx_addr,
+			 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+			 DMA_FROM_DEVICE);
+}
+
+static void spi_imx_dma_rx_data_handle(struct spi_imx_data *spi_imx,
+				       struct dma_data_package *dma_data, void *rx_buf,
+				       bool word_delay)
+{
+	void *copy_ptr;
+	int unaligned;
+
+	/*
+	 * On little-endian CPUs, adjust byte order:
+	 * - Swap bytes when bpw = 8
+	 * - Swap half-words when bpw = 16
+	 * This ensures correct data ordering for DMA transfers.
+	 */
+#ifdef __LITTLE_ENDIAN
+	if (!word_delay) {
+		unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+		u32 *temp = dma_data->dma_rx_buf;
+
+		for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, sizeof(*temp)); i++) {
+			if (bytes_per_word == 1)
+				swab32s(temp + i);
+			else if (bytes_per_word == 2)
+				swahw32s(temp + i);
+		}
+	}
+#endif
+
+	/*
+	 * With dynamic burst enabled, RX DMA always reads 32-bit words from the
+	 * RXFIFO (buswidth = 4). When data_len is not 4-byte aligned, the
+	 * reference manual states that for a burst length of 32 * n + m bits, a
+	 * SPI burst carries the m LSBs in the first word and 32 bits in each of
+	 * the other n words. The first word therefore starts with garbage
+	 * bytes: trim it before copying the actual data to rx_buf.
+	 */
+	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+		copy_ptr = (u8 *)dma_data->dma_rx_buf + BYTES_PER_32BITS_WORD - unaligned;
+	} else {
+		copy_ptr = dma_data->dma_rx_buf;
+	}
+
+	memcpy(rx_buf, copy_ptr, dma_data->data_len);
+}
+
+static int spi_imx_dma_map(struct spi_imx_data *spi_imx,
+			   struct dma_data_package *dma_data)
+{
+	struct spi_controller *controller = spi_imx->controller;
+	struct device *tx_dev = controller->dma_tx->device->dev;
+	struct device *rx_dev = controller->dma_rx->device->dev;
+	int ret;
+
+	dma_data->dma_tx_addr = dma_map_single(tx_dev, dma_data->dma_tx_buf,
+					       DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+					       DMA_TO_DEVICE);
+	ret = dma_mapping_error(tx_dev, dma_data->dma_tx_addr);
+	if (ret < 0) {
+		dev_err(spi_imx->dev, "DMA TX map failed %d\n", ret);
+		return ret;
+	}
+
+	dma_data->dma_rx_addr = dma_map_single(rx_dev, dma_data->dma_rx_buf,
+					       DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+					       DMA_FROM_DEVICE);
+	ret = dma_mapping_error(rx_dev, dma_data->dma_rx_addr);
+	if (ret < 0) {
+		dev_err(spi_imx->dev, "DMA RX map failed %d\n", ret);
+		dma_unmap_single(tx_dev, dma_data->dma_tx_addr,
+				 DMA_CACHE_ALIGNED_LEN(dma_data->dma_len),
+				 DMA_TO_DEVICE);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int spi_imx_dma_tx_data_handle(struct spi_imx_data *spi_imx,
+				      struct dma_data_package *dma_data,
+				      const void *tx_buf,
+				      bool word_delay)
+{
+	void *copy_ptr;
+	int unaligned;
+
+	if (word_delay) {
+		dma_data->dma_len = dma_data->data_len;
+	} else {
+		/*
+		 * As per the reference manual, when burst length = 32 * n + m
+		 * bits, ECSPI sends the m LSBs in the first word, followed by n
+		 * full 32-bit words. Since the actual data may not be 4-byte
+		 * aligned, allocate DMA TX/RX bounce buffers to guarantee
+		 * alignment. For TX, DMA pushes 4-byte aligned words to the
+		 * TXFIFO while ECSPI relies on the BURST_LENGTH setting to
+		 * clock out the correct bit count.
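+		 * For example, with data_len = 6 (48 bits = 32 * 1 + 16), the
+		 * first 32-bit word carries only the 2 least significant
+		 * bytes, so the payload is copied starting at offset 2 of the
+		 * 8-byte bounce buffer.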
+		 * For RX, DMA always receives 32-bit words from the RXFIFO;
+		 * when data_len is not 4-byte aligned, trim the first word to
+		 * drop the garbage bytes, then copy the valid data from each
+		 * transfer's DMA bounce buffer to rx_buf.
+		 */
+		dma_data->dma_len = ALIGN(dma_data->data_len, BYTES_PER_32BITS_WORD);
+	}
+
+	dma_data->dma_tx_buf = kzalloc(dma_data->dma_len, GFP_KERNEL);
+	if (!dma_data->dma_tx_buf)
+		return -ENOMEM;
+
+	dma_data->dma_rx_buf = kzalloc(dma_data->dma_len, GFP_KERNEL);
+	if (!dma_data->dma_rx_buf) {
+		kfree(dma_data->dma_tx_buf);
+		return -ENOMEM;
+	}
+
+	if (dma_data->data_len % BYTES_PER_32BITS_WORD && !word_delay) {
+		unaligned = dma_data->data_len % BYTES_PER_32BITS_WORD;
+		copy_ptr = (u8 *)dma_data->dma_tx_buf + BYTES_PER_32BITS_WORD - unaligned;
+	} else {
+		copy_ptr = dma_data->dma_tx_buf;
+	}
+
+	memcpy(copy_ptr, tx_buf, dma_data->data_len);
+
+	/*
+	 * When word_delay is enabled, DMA transfers an entire word in one minor
+	 * loop. In this case, no data requires additional handling.
+	 */
+	if (word_delay)
+		return 0;
+
+#ifdef __LITTLE_ENDIAN
+	/*
+	 * On little-endian CPUs, adjust byte order:
+	 * - Swap bytes when bpw = 8
+	 * - Swap half-words when bpw = 16
+	 * This ensures correct data ordering for DMA transfers.
+	 */
+	unsigned int bytes_per_word = spi_imx_bytes_per_word(spi_imx->bits_per_word);
+	u32 *temp = dma_data->dma_tx_buf;
+
+	for (int i = 0; i < DIV_ROUND_UP(dma_data->dma_len, sizeof(*temp)); i++) {
+		if (bytes_per_word == 1)
+			swab32s(temp + i);
+		else if (bytes_per_word == 2)
+			swahw32s(temp + i);
+	}
+#endif
+
+	return 0;
+}
+
+static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx,
+				    struct spi_transfer *transfer,
+				    bool word_delay)
+{
+	u32 pre_bl, tail_bl;
+	u32 ctrl;
+	int ret;
+
+	/*
+	 * ECSPI supports a maximum burst of 512 bytes. When xfer->len exceeds
+	 * 512 bytes and is not a multiple of 512, a tail transfer is required.
+	 * BURST_LENGTH tells the hardware how many bits to clock per burst, so
+	 * it must track the data length, and it cannot be updated once a DMA
+	 * request has been submitted. In that case, split the transfer into two
+	 * packages and reprogram the register before setting up the second DMA
+	 * transfer.
+	 */
+	ctrl = readl(spi_imx->base + MX51_ECSPI_CTRL);
+	if (word_delay) {
+		/*
+		 * Per the reference manual, the Sample Period Control Register
+		 * (ECSPI_PERIODREG) provides software a way to insert delays
+		 * (wait states) between consecutive SPI transfers. As a result,
+		 * ECSPI can only transfer one word per frame, and the delay
+		 * occurs between frames.
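+		 * Keep BURST_LENGTH at one word (bits_per_word - 1) so that
+		 * each frame carries exactly one word, with the programmed
+		 * sample period inserted in between.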
+		 */
+		spi_imx->dma_package_num = 1;
+		pre_bl = spi_imx->bits_per_word - 1;
+	} else if (transfer->len <= MX51_ECSPI_CTRL_MAX_BURST) {
+		spi_imx->dma_package_num = 1;
+		pre_bl = transfer->len * BITS_PER_BYTE - 1;
+	} else if (!(transfer->len % MX51_ECSPI_CTRL_MAX_BURST)) {
+		spi_imx->dma_package_num = 1;
+		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+	} else {
+		spi_imx->dma_package_num = 2;
+		pre_bl = MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1;
+		tail_bl = (transfer->len % MX51_ECSPI_CTRL_MAX_BURST) * BITS_PER_BYTE - 1;
+	}
+
+	spi_imx->dma_data = kmalloc_array(spi_imx->dma_package_num,
+					  sizeof(struct dma_data_package),
+					  GFP_KERNEL | __GFP_ZERO);
+	if (!spi_imx->dma_data) {
+		dev_err(spi_imx->dev, "Failed to allocate DMA package buffer!\n");
+		return -ENOMEM;
+	}
+
+	if (spi_imx->dma_package_num == 1) {
+		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+		spi_imx->dma_data[0].cmd_word = ctrl;
+		spi_imx->dma_data[0].data_len = transfer->len;
+		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+						 word_delay);
+		if (ret) {
+			kfree(spi_imx->dma_data);
+			return ret;
+		}
+	} else {
+		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+		ctrl |= pre_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+		spi_imx->dma_data[0].cmd_word = ctrl;
+		spi_imx->dma_data[0].data_len = round_down(transfer->len,
+							   MX51_ECSPI_CTRL_MAX_BURST);
+		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[0], transfer->tx_buf,
+						 false);
+		if (ret) {
+			kfree(spi_imx->dma_data);
+			return ret;
+		}
+
+		ctrl &= ~MX51_ECSPI_CTRL_BL_MASK;
+		ctrl |= tail_bl << MX51_ECSPI_CTRL_BL_OFFSET;
+		spi_imx->dma_data[1].cmd_word = ctrl;
+		spi_imx->dma_data[1].data_len = transfer->len % MX51_ECSPI_CTRL_MAX_BURST;
+		ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1],
+						 transfer->tx_buf + spi_imx->dma_data[0].data_len,
+						 false);
+		if (ret) {
+			kfree(spi_imx->dma_data[0].dma_tx_buf);
+			kfree(spi_imx->dma_data[0].dma_rx_buf);
+			kfree(spi_imx->dma_data);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
 static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
+			      struct dma_data_package *dma_data,
 			      struct spi_transfer *transfer)
 {
-	struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg;
 	struct spi_controller *controller = spi_imx->controller;
 	struct dma_async_tx_descriptor *desc_tx, *desc_rx;
 	unsigned long transfer_timeout;
@@ -1451,9 +1723,9 @@ static int spi_imx_dma_submit(struct spi_imx_data *spi_imx,
 	/*
 	 * The TX DMA setup starts the transfer, so make sure RX is configured
 	 * before TX.
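+	 * Otherwise the RXFIFO could overrun before the RX channel is armed.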
 	 */
-	desc_rx = dmaengine_prep_slave_sg(controller->dma_rx,
-					  rx->sgl, rx->nents, DMA_DEV_TO_MEM,
-					  DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	desc_rx = dmaengine_prep_slave_single(controller->dma_rx, dma_data->dma_rx_addr,
+					      dma_data->dma_len, DMA_DEV_TO_MEM,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!desc_rx) {
 		transfer->error |= SPI_TRANS_FAIL_NO_START;
 		return -EINVAL;
@@ -1471,9 +1743,9 @@
 	reinit_completion(&spi_imx->dma_rx_completion);
 	dma_async_issue_pending(controller->dma_rx);
 
-	desc_tx = dmaengine_prep_slave_sg(controller->dma_tx,
-					  tx->sgl, tx->nents, DMA_MEM_TO_DEV,
-					  DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	desc_tx = dmaengine_prep_slave_single(controller->dma_tx, dma_data->dma_tx_addr,
+					      dma_data->dma_len, DMA_MEM_TO_DEV,
+					      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!desc_tx)
 		goto dmaengine_terminate_rx;
 
@@ -1521,16 +1793,16 @@
 }
 
 static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx,
-				     struct spi_transfer *transfer)
+				     struct dma_data_package *dma_data,
+				     bool word_delay)
 {
-	struct sg_table *rx = &transfer->rx_sg;
-	struct scatterlist *last_sg = sg_last(rx->sgl, rx->nents);
-	unsigned int bytes_per_word, i;
+	unsigned int bytes_per_word = word_delay ?
+				      spi_imx_bytes_per_word(spi_imx->bits_per_word) :
+				      BYTES_PER_32BITS_WORD;
+	unsigned int i;
 
-	/* Get the right burst length from the last sg to ensure no tail data */
-	bytes_per_word = spi_imx_bytes_per_word(transfer->bits_per_word);
 	for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) {
-		if (!(sg_dma_len(last_sg) % (i * bytes_per_word)))
+		if (!(dma_data->dma_len % (i * bytes_per_word)))
 			break;
 	}
 	/* Use 1 as wml in case no available burst length got */
@@ -1540,25 +1812,29 @@
 	spi_imx->wml = i;
 }
 
-static int spi_imx_dma_configure(struct spi_controller *controller)
+static int spi_imx_dma_configure(struct spi_controller *controller, bool word_delay)
 {
 	int ret;
 	enum dma_slave_buswidth buswidth;
 	struct dma_slave_config rx = {}, tx = {};
 	struct spi_imx_data *spi_imx = spi_controller_get_devdata(controller);
 
-	switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
-	case 4:
+	if (word_delay) {
+		switch (spi_imx_bytes_per_word(spi_imx->bits_per_word)) {
+		case 4:
+			buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
+			break;
+		case 2:
+			buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
+			break;
+		case 1:
+			buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
+			break;
+		default:
+			return -EINVAL;
+		}
+	} else {
 		buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES;
-		break;
-	case 2:
-		buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
-		break;
-	case 1:
-		buswidth = DMA_SLAVE_BUSWIDTH_1_BYTE;
-		break;
-	default:
-		return -EINVAL;
 	}
 
 	tx.direction = DMA_MEM_TO_DEV;
@@ -1584,15 +1860,17 @@
 	return 0;
 }
 
-static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
-				struct spi_transfer *transfer)
+static int spi_imx_dma_package_transfer(struct spi_imx_data *spi_imx,
+					struct dma_data_package *dma_data,
+					struct spi_transfer *transfer,
+					bool word_delay)
 {
 	struct spi_controller *controller = spi_imx->controller;
 	int ret;
 
-	spi_imx_dma_max_wml_find(spi_imx, transfer);
+	spi_imx_dma_max_wml_find(spi_imx, dma_data, word_delay);
 
-	ret = spi_imx_dma_configure(controller);
+	ret = spi_imx_dma_configure(controller, word_delay);
 	if (ret)
 		goto dma_failure_no_start;
 
@@ -1603,10 +1881,17 @@
 	}
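 
+	/* Program the FIFO watermark chosen by spi_imx_dma_max_wml_find() above */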
 	spi_imx->devtype_data->setup_wml(spi_imx);
 
-	ret = spi_imx_dma_submit(spi_imx, transfer);
+	ret = spi_imx_dma_submit(spi_imx, dma_data, transfer);
 	if (ret)
 		return ret;
 
+	/* Trim the DMA RX buffer and copy the actual data to rx_buf */
+	dma_sync_single_for_cpu(controller->dma_rx->device->dev, dma_data->dma_rx_addr,
+				dma_data->dma_len, DMA_FROM_DEVICE);
+	spi_imx_dma_rx_data_handle(spi_imx, dma_data, transfer->rx_buf + spi_imx->rx_offset,
+				   word_delay);
+	spi_imx->rx_offset += dma_data->data_len;
+
 	return 0;
 /* fallback to pio */
 dma_failure_no_start:
@@ -1614,6 +1899,57 @@
 	return ret;
 }
 
+static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
+				struct spi_transfer *transfer)
+{
+	bool word_delay = transfer->word_delay.value != 0;
+	int ret;
+	int i;
+
+	ret = spi_imx_dma_data_prepare(spi_imx, transfer, word_delay);
+	if (ret < 0) {
+		transfer->error |= SPI_TRANS_FAIL_NO_START;
+		dev_err(spi_imx->dev, "DMA data prepare fail\n");
+		goto fallback_pio;
+	}
+
+	spi_imx->rx_offset = 0;
+
+	/* Each dma_data_package is transferred in a separate DMA transaction */
+	for (i = 0; i < spi_imx->dma_package_num; i++) {
+		ret = spi_imx_dma_map(spi_imx, &spi_imx->dma_data[i]);
+		if (ret < 0) {
+			if (i == 0)
+				transfer->error |= SPI_TRANS_FAIL_NO_START;
+			dev_err(spi_imx->dev, "DMA map fail\n");
+			break;
+		}
+
+		/* Update the BL field of the CTRL register */
+		writel(spi_imx->dma_data[i].cmd_word, spi_imx->base + MX51_ECSPI_CTRL);
+
+		ret = spi_imx_dma_package_transfer(spi_imx, &spi_imx->dma_data[i],
+						   transfer, word_delay);
+
+		/* Unmap is required whether or not the DMA transfer succeeded */
+		spi_imx_dma_unmap(spi_imx, &spi_imx->dma_data[i]);
+
+		if (ret < 0) {
+			dev_dbg(spi_imx->dev, "DMA transfer %d did not complete\n", i);
+			break;
+		}
+	}
+
+	for (int j = 0; j < spi_imx->dma_package_num; j++) {
+		kfree(spi_imx->dma_data[j].dma_tx_buf);
+		kfree(spi_imx->dma_data[j].dma_rx_buf);
+	}
+	kfree(spi_imx->dma_data);
+
+fallback_pio:
+	return ret;
+}
+
 static int spi_imx_pio_transfer(struct spi_device *spi,
 				struct spi_transfer *transfer)
 {
@@ -1780,9 +2116,14 @@ static int spi_imx_transfer_one(struct spi_controller *controller,
 	 * transfer, the SPI transfer has already been mapped, so we
 	 * have to do the DMA transfer here.
 	 */
-	if (spi_imx->usedma)
-		return spi_imx_dma_transfer(spi_imx, transfer);
-
+	if (spi_imx->usedma) {
+		ret = spi_imx_dma_transfer(spi_imx, transfer);
+		if (transfer->error & SPI_TRANS_FAIL_NO_START) {
+			spi_imx->usedma = false;
+			return spi_imx_pio_transfer(spi, transfer);
+		}
+		return ret;
+	}
 	/* run in polling mode for short transfers */
 	if (transfer->len == 1 || (polling_limit_us &&
 				   spi_imx_transfer_estimate_time_us(transfer) < polling_limit_us))
-- 
2.34.1
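For reference, the throughput figures above came from spidev_test. An
invocation along these lines should reproduce the 4 KiB @ 10 MHz case
(illustrative only; the device node is board-specific and option
spelling can vary between spidev_test versions):

	./spidev_test -D /dev/spidev0.0 -s 10000000 -S 4096 -I 8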
